: Glimmer minimal gene length (range 30-120, only for FASTA inputs, default: 90)\n--taxon : Taxonomy: prokaryotic / eukaryotic (default: p)\n--cores : Number of parallel CPUs to use for threading (default: all)\n--clusterblast : Include ClusterBlast gene cluster comparison analysis (default:y)\n--smcogs : Include smCOG analysis for functional prediction of genes (default:y)\n--fullblast : Include genome-wide BLAST analysis (default:n)\n--fullhmm : Include genome-wide PFAM HMM analysis (default:n)\n--blastdbpath : Specify folder containing CLUSEAN blast database (default:clusean/db)\n--pfamdbpath : Specify folder containing PFAM database (default:clusean/db)\n--geneclustertypes : Gene cluster types to scan for (default:1):\n1 = all\n2 = type I polyketide synthases\n3 = type II polyketide synthases\n4 = type III polyketide synthases\n5 = nonribosomal peptide synthetases\n6 = terpene synthases\n7 = lantibiotics\n8 = bacteriocins\n9 = beta-lactams\n10 = aminoglycosides / aminocyclitols\n11 = aminocoumarins\n12 = siderophores\n13 = ectoines\n14 = butyrolactones\n15 = indoles\n16 = nucleosides\n17 = phosphoglycolipids\n18 = melanins\n19 = others\n--help : this help screen\n"
+ #Check input file format
+ if len(sys.argv) < 2 or len(sys.argv[1]) < 1:
+ print >> sys.stderr, "Please supply valid name for input file."
+ print "Usage: antismash [options]"
+ print "From the command line, input antismash --help for more information."
+ logfile.write("Input format error. Please supply valid name for infile.\n")
+ logfile.write("Usage: antismash [options]\n")
+ logfile.write("From the command line, input antismash --help for more information.\n")
+ logfile.close()
+ sys.exit(1)
+ if sys.argv[1] != "--help":
+ if len(sys.argv[1].split(".")) < 2 or (sys.argv[1].split(".")[-1] != "embl" and sys.argv[1].split(".")[-1] != "EMBL" and sys.argv[1].split(".")[-1] != "emb" and sys.argv[1].split(".")[-1] != "EMB" and sys.argv[1].split(".")[-1] != "genbank" and sys.argv[1].split(".")[-1] != "GENBANK" and sys.argv[1].split(".")[-1] != "gbk" and sys.argv[1].split(".")[-1] != "GBK" and sys.argv[1].split(".")[-1] != "gb" and sys.argv[1].split(".")[-1] != "GB" and sys.argv[1].split(".")[-1] != "fasta" and sys.argv[1].split(".")[-1] != "FASTA" and sys.argv[1].split(".")[-1] != "fas" and sys.argv[1].split(".")[-1] != "FAS" and sys.argv[1].split(".")[-1] != "fa" and sys.argv[1].split(".")[-1] != "FA"):
+ print >> sys.stderr, "No EMBL/GBK/FASTA file submitted as input. Please supply a valid file with .embl / .gbk / .fasta extension. "
+ print "Usage: antismash [options]"
+ print "From the command line, input antismash --help for more information."
+ logfile.write("Input format error. Please supply a valid file with .embl / .gbk / .fasta extension.\n")
+ logfile.write("Usage: antismash [options]\n")
+ logfile.write("From the command line, input antismash --help for more information.\n")
+ logfile.close()
+ sys.exit(1)
+ #Define input filename and make fixes if necessary
+ infile = sys.argv[1]
+ try:
+ testfile = open(infile,"r").read()
+ except(IOError):
+ print >> sys.stderr, "Please supply valid name for input file."
+ print "Usage: antismash [options]"
+ print "From the command line, input antismash --help for more information."
+ logfile = open("antismash.log","w")
+ logfile.write("Input format error. Please supply valid name for infile.\n")
+ logfile.write("Usage: antismash [options]\n")
+ logfile.write("From the command line, input antismash --help for more information.\n")
+ logfile.close()
+ sys.exit(1)
+ #Parse absolute paths if found
+ absolutepath = "n"
+ if "/" in infile or "\\" in infile:
+ absolutepath = "y"
+ lastpos1 = infile.rfind("\\")
+ lastpos2 = infile.rfind("/")
+ lastpos = max([lastpos1,lastpos2])
+ originpath = infile[:(lastpos + 1)]
+ infile = infile[(lastpos + 1):]
+ if sys.platform == ('win32'):
+ copycommand = 'copy/y "' + originpath + infile + '" ' + infile + ' > nul'
+ os.system(copycommand)
+ if sys.platform == ('linux2'):
+ copycommand = 'cp ' + originpath + infile + " . > /dev/null"
+ os.system(copycommand)
+ #genomename = ".".join(infile.split(".")[:-1])
+ #for i in genomename:
+ # if i in '!"#$%&()*+,./:;=>?@[]^`{|}' or i in "'":
+ # genomename = genomename.replace(i,"")
+ # if "/" in genomename:
+ # genomename = genomename.rpartition("/")[2]
+ # if "\\" in genomename:
+ # genomename = genomename.rpartition("\\")[2]
+ genomename = os.path.splitext(os.path.basename(infile))[0]
+ if sys.platform == ('linux2'):
+ if genomename != infile.split(".")[-2]:
+ oldinfile = infile.replace("(","\\(").replace(")","\\)").replace("*","\\*").replace("&","\\&").replace("!","\\!").replace("$","\\$").replace("{","\\{").replace("}","\\}").replace("|","\\|").replace("`","\\`").replace("'","\\'").replace('"','\\"').replace('?','\\?')
+ infile = genomename + "." + infile.split(".")[-1]
+ if "/" in genomename:
+ genomename = genomename.rpartition("/")[2]
+ if "\\" in genomename:
+ genomename = genomename.rpartition("\\")[2]
+ os.system("cp " + oldinfile + " " + infile)
+ #Define outputfolder
+ if absolutepath == "y":
+ if sys.platform == ('win32'):
+ dir1 = os.popen("dir/w/ad " + originpath)
+ dir2 = os.popen("dir/w/ad")
+ dir1 = dir1.read()
+ dir2 = dir2.read()
+ if sys.platform == ('linux2'):
+ dir1 = os.popen("ls")
+ dir2 = os.popen("ls " + originpath)
+ dir1 = dir1.read()
+ dir2 = dir2.read()
+ parts = dir1.split(" ") + dir2.split(" ")
+ else:
+ if sys.platform == ('win32'):
+ dir = os.popen("dir/w/ad")
+ dir = dir.read()
+ if sys.platform == ('linux2'):
+ dir = os.popen("ls")
+ dir = dir.read()
+ parts = dir.split(" ")
+ parts2 = []
+ for i in parts:
+ partparts = i.split("\n")
+ for i in partparts:
+ i = i.replace("[","")
+ i = i.replace("]","")
+ parts2.append(i)
+ parts = parts2
+ oldgenomename = genomename
+ if genomename in parts:
+ genomename = genomename + "_" + str(0)
+ while genomename in parts:
+ finalpart = genomename.split("_")[-1]
+ allnumbers = "y"
+ for i in finalpart:
+ if i not in ["0","1","2","3","4","5","6","7","8","9"]:
+ allnumbers = "n"
+ if allnumbers == "y" and int(finalpart) in range(0,1000):
+ newgenomename = ""
+ for i in genomename.split("_")[:-1]:
+ newgenomename = newgenomename + "_" + i
+ newgenomename = newgenomename + "_" + str(int(finalpart) + 1)
+ genomename = newgenomename[1:]
+ genomename = genomename.replace("__","_")
+ #Output results folder name for output checking by GUI
+ resultslocfile = open("resultsfolder.txt","w")
+ resultslocfile.write(os.getcwd() + os.sep + genomename)
+ resultslocfile.close()
+ #Implement defaults
+ glimmertransl_table = str(1)
+ genomeconf = "l"
+ minglength = str(90)
+ cores = "all"
+ taxon = "p"
+ clusterblast = "y"
+ smcogs = "y"
+ fullblast = "n"
+ fullhmm = "n"
+ if sys.platform == ('win32'):
+ blastdbpath = '"' + os.getcwd() + "/clusean/db" + '"'
+ if sys.platform == ('linux2'):
+ blastdbpath = os.getcwd() + "/clusean/db"
+ if sys.platform == ('win32'):
+ pfamdbpath = '"' + os.getcwd() + "/clusean/db/" + '"'
+ if sys.platform == ('linux2'):
+ pfamdbpath = os.getcwd() + "/clusean/db/"
+ geneclustertypes = [1]
+ #Read user-specified options which may override defaults
+ if len(sys.argv) > 2 or sys.argv[1] == "--help":
+ options = sys.argv
+ if "--" in options[-1] and sys.argv[1] != "--help":
+ invalidoptions(options[-1])
+ #identify option identifiers
+ identifiers = []
+ for i in options:
+ if "--" in i:
+ if i not in identifiers:
+ identifiers.append(i)
+ else:
+ invalidoptions("No '--' in given options or option given twice.")
+ for i in identifiers:
+ if i != "--help":
+ value = options[options.index(i) + 1].strip()
+ if i == "--gtransl":
+ for k in value:
+ if k not in ["0","1","2","3","4","5","6","7","8","9"]:
+ invalidoptions(i + "input is no number")
+ if int(value) in range(1,24) and int(value) != 7 and int(value) != 8 and int(value) != 17 and int(value) != 18 and int(value) != 19 and int(value) != 20:
+ glimmertransl_table = value
+ else:
+ invalidoptions(i)
+ elif i == "--genomeconf":
+ if value == "l" or value == "c":
+ genomeconf = value
+ else:
+ invalidoptions(i)
+ elif i == "--minglength":
+ for k in value:
+ if k not in ["0","1","2","3","4","5","6","7","8","9"]:
+ invalidoptions(i)
+ if int(value) in range(30,91):
+ minglength = value
+ else:
+ print >> sys.stderr, "Invalid options input: minimal gene length should be a number between 30-90."
+ logfile = open("antismash.log","w")
+ logfile.write("Invalid options input: minimal gene length should be a number between 30-90.\n")
+ logfile.close()
+ sys.exit(1)
+ elif i == "--cores":
+ for k in value:
+ if k not in ["0","1","2","3","4","5","6","7","8","9"]:
+ invalidoptions(i)
+ if int(value) in range(1,1000):
+ cores = int(value)
+ else:
+ invalidoptions(i)
+ elif i == "--taxon":
+ if value == "p" or value == "e":
+ taxon = value
+ else:
+ invalidoptions(i)
+ elif i == "--clusterblast":
+ if value == "y" or value == "n":
+ clusterblast = value
+ else:
+ invalidoptions(i)
+ elif i == "--smcogs":
+ if value == "y" or value == "n":
+ smcogs = value
+ else:
+ invalidoptions(i)
+ elif i == "--fullblast":
+ if value == "y" or value == "n":
+ fullblast = value
+ else:
+ invalidoptions(i)
+ elif i == "--fullhmm":
+ if value == "y" or value == "n":
+ fullhmm = value
+ else:
+ invalidoptions(i)
+ elif i == "--glimmer_prediction":
+ glimmer_prediction_path = value
+ elif i == "--blastdbpath":
+ if sys.platform == ('win32'):
+ if options[options.index(i) + 1][0] != '"':
+ value = '"' + options[options.index(i) + 1] + '"'
+ else:
+ value = options[options.index(i) + 1]
+ if ":\\" in value:
+ blastdbpath = value
+ elif "\\" in value or "/" in value:
+ if value[0] == "\\" or value[0] == "/":
+ blastdbpath = os.getcwd() + value
+ else:
+ blastdbpath = os.getcwd() + "\\" + value
+ else:
+ blastdbpath = os.getcwd() + "\\" + value
+ if sys.platform == ('linux2'):
+ value = options[options.index(i) + 1]
+ if "\\" in value or "/" in value:
+ value = value.replace("\\","/")
+ if value[0] == "/":
+ blastdbpath = value
+ else:
+ blastdbpath = os.getcwd() + "/" + value
+ else:
+ blastdbpath = os.getcwd() + "/" + value
+ elif i == "--pfamdbpath":
+ if sys.platform == ('win32'):
+ if options[options.index(i) + 1][0] != '"':
+ value = '"' + options[options.index(i) + 1] + '"'
+ else:
+ value = options[options.index(i) + 1]
+ if ":\\" in value:
+ pfamdbpath = value
+ elif "\\" in value or "/" in value:
+ if value[0] == "\\" or value[0] == "/":
+ pfamdbpath = os.getcwd() + value
+ else:
+ pfamdbpath = os.getcwd() + "\\" + value
+ else:
+ pfamdbpath = os.getcwd() + "\\" + value
+ if sys.platform == ('linux2'):
+ value = options[options.index(i) + 1]
+ if "\\" in value or "/" in value:
+ value = value.replace("\\","/")
+ if value[0] == "/":
+ pfamdbpath = value
+ else:
+ pfamdbpath = os.getcwd() + "/" + value
+ else:
+ pfamdbpath = os.getcwd() + "/" + value
+ elif i == "--geneclustertypes":
+ if "," not in value and value not in ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"]:
+ invalidoptions(i)
+ else:
+ types = value.split(",")
+ types2 = []
+ if "1" in types:
+ types2 = [1]
+ for j in types:
+ if int(j) not in range(1,20):
+ invalidoptions(i)
+ else:
+ types2.append(int(j))
+ geneclustertypes = types2
+ elif i == "--help":
+ print inputinstructions
+ sys.exit()
+ elif i == "--gui":
+ pass
+ else:
+ invalidoptions(i)
+
+ #Determine number of CPUs used
+ if cores == "all":
+ try:
+ nrcpus = multiprocessing.cpu_count()
+ except(IOError,OSError,NotImplementedError):
+ nrcpus = 1
+ else:
+ try:
+ nrcpus = multiprocessing.cpu_count()
+ except(IOError,OSError,NotImplementedError):
+ nrcpus = 1
+ if cores < nrcpus:
+ nrcpus = cores
+
+ #Create directory structure needed for file storage
+ try:
+ os.mkdir(genomename)
+ except(IOError,OSError):
+ pass
+ hmmoutputfolder = genomename + "/hmmoutput/"
+ try:
+ os.mkdir(hmmoutputfolder)
+ except(IOError,OSError):
+ pass
+ nrpspksoutputfolder = genomename + "/nrpspks/"
+ try:
+ os.mkdir(nrpspksoutputfolder)
+ except(IOError,OSError):
+ pass
+ nrpspredictoroutputfolder = nrpspksoutputfolder + "nrpspredictor/"
+ try:
+ os.mkdir(nrpspredictoroutputfolder)
+ except(IOError,OSError):
+ pass
+ minowanrpsoutputfolder = nrpspksoutputfolder + "minowanrpspred/"
+ try:
+ os.mkdir(minowanrpsoutputfolder)
+ except(IOError,OSError):
+ pass
+ minowapksoutputfolder = nrpspksoutputfolder + "minowapkspred/"
+ try:
+ os.mkdir(minowapksoutputfolder)
+ except(IOError,OSError):
+ pass
+ minowacaloutputfolder = nrpspksoutputfolder + "minowacalpred/"
+ try:
+ os.mkdir(minowacaloutputfolder)
+ except(IOError,OSError):
+ pass
+ pkssignatureoutputfolder = nrpspksoutputfolder + "pkssignatures/"
+ try:
+ os.mkdir(pkssignatureoutputfolder)
+ except(IOError,OSError):
+ pass
+ kranalysisoutputfolder = nrpspksoutputfolder + "kr_analysis/"
+ try:
+ os.mkdir(kranalysisoutputfolder)
+ except(IOError,OSError):
+ pass
+ clusterblastoutputfolder = genomename + "/clusterblast/"
+ try:
+ os.mkdir(clusterblastoutputfolder)
+ except(IOError,OSError):
+ pass
+ smcogsoutputfolder = genomename + "/smcogs/"
+ try:
+ os.mkdir(smcogsoutputfolder)
+ except(IOError,OSError):
+ pass
+ substrspecsfolder = genomename + "/substrspecs/"
+ try:
+ os.mkdir(substrspecsfolder)
+ except(IOError,OSError):
+ pass
+ structuresfolder = genomename + "/structures/"
+ try:
+ os.mkdir(structuresfolder)
+ except(IOError,OSError):
+ pass
+ svgfolder = genomename + "/svg/"
+ try:
+ os.mkdir(svgfolder)
+ except(IOError,OSError):
+ pass
+ searchgtrfolder = genomename + "/searchgtr/"
+ try:
+ os.mkdir(searchgtrfolder)
+ except(IOError,OSError):
+ pass
+ htmlfolder = genomename + "/html/"
+ try:
+ os.mkdir(htmlfolder)
+ except(IOError,OSError):
+ pass
+ imagesfolder = genomename + "/images/"
+ try:
+ os.mkdir(imagesfolder)
+ except(IOError,OSError):
+ pass
+
+ #If input is unannotated GBK/EMBL file, convert to FASTA and use that as input
+ if " CDS " not in open(infile,"r").read() and "FT CDS " not in open(infile,"r").read():
+ if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB":
+ filetext = open(infile,"r").read()
+ if "\nSQ" not in filetext:
+ print >> sys.stderr, "Exit: EMBL file not properly formatted, no sequence found."
+ logfile = open("antismash.log","w")
+ logfile.write("Exit: EMBL file not properly formatted, no sequence found.\n")
+ logfile.close()
+ sys.exit(1)
+ dnaseq = filetext.split("\nSQ")[1]
+ dnaseq = cleandnaseq(dnaseq)
+ sequence = dnaseq
+ if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
+ print >> sys.stderr, "Protein EMBL file provided. Please provide nucleotide EMBL file."
+ sys.exit(1)
+ fastafile = open(infile.rpartition(".")[0] + ".fasta","w")
+ fastafile.write(">" + infile.rpartition(".")[0] + "|\n")
+ fastafile.write(sequence)
+ fastafile.close()
+ infile = fastafile
+ elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK":
+ filetext = open(infile,"r").read()
+ if "\nORIGIN" not in filetext:
+ print >> sys.stderr, "Exit: GBK file not properly formatted, no sequence found."
+ logfile = open("antismash.log","w")
+ logfile.write("Exit: GBK file not properly formatted, no sequence found.\n")
+ logfile.close()
+ sys.exit(1)
+ dnaseq = filetext.split("\nORIGIN")[1]
+ dnaseq = cleandnaseq(dnaseq)
+ sequence = dnaseq
+ if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
+ print >> sys.stderr, "Protein GBK file provided. Please provide nucleotide GBK file."
+ sys.exit(1)
+ fastafile = open(infile.rpartition(".")[0] + ".fasta","w")
+ fastafile.write(">" + infile.rpartition(".")[0] + "\n")
+ fastafile.write(sequence)
+ fastafile.close()
+ infile = infile.rpartition(".")[0] + ".fasta"
+ #If input is unannotated fasta file, predict genes with Glimmer and create EMBL file. If input is EMBL or GBK file, read input embl/gbk and create input fasta file, read input protein info into memory
+ annotated = "y"
+ if infile.split(".")[-1] == "fasta" or infile.split(".")[-1] == "FASTA" or infile.split(".")[-1] == "FAS" or infile.split(".")[-1] == "fas" or infile.split(".")[-1] == "FA" or infile.split(".")[-1] == "fa":
+ annotated = "n"
+ #Check input file formatting
+ sequence = get_sequence(infile)
+ if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)):
+ print >> sys.stderr, "Protein FASTA file provided. Please provide nucleotide FASTA file."
+ sys.exit(1)
+ nucleotides = ["A","a","C","c","G","g","T","t","N","n"]
+ badsequence = "n"
+ sequence_name = open(infile,"r").read().partition(">")[2].partition("\n")[0]
+ for i in sequence:
+ if i not in nucleotides:
+ badsequence = "y"
+ if badsequence == "y":
+ cleaned_sequence = cleandnaseq(sequence)
+ badsequence = "n"
+ for i in cleaned_sequence:
+ if i not in nucleotides:
+ badsequence = "y"
+ if badsequence == "n":
+ writefasta([sequence_name],[cleaned_sequence],infile.rpartition(".")[0] + "_f.fasta")
+ infile = infile.rpartition(".")[0] + "_f.fasta"
+ else:
+ print >>sys.stderr, "Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file."
+ logfile = open("antismash.log","w")
+ logfile.write("Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file.\n")
+ logfile.close()
+ sys.exit(1)
+ revseq = reverse_complement(sequence)
+ seqlength = len(sequence)
+
+ #Print Glimmer notification
+ #if taxon == "p":
+ # print "Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome..."
+ #elif taxon == "e":
+ # print "Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome..."
+ logfile = open("antismash.log","w")
+ if taxon == "p":
+ logfile.write("Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome...\n")
+ elif taxon == "e":
+ logfile.write("Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome...\n")
+ #logfile.close()
+ loginfo = open("antismash.log","r").read()
+ #logfile.close()
+ #Copying file and changing to folder to prepare for Glimmer3 prediction
+ os.mkdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
+ if sys.platform == ('win32'):
+ os.system("copy/y " + infile + " geneprediction > nul")
+ if sys.platform == ('linux2'):
+ os.system("cp " + infile + " geneprediction > /dev/null")
+
+ os.chdir( os.path.join(os.getcwd(), genomename, "geneprediction"))
+ fastafile = '../../'+infile
+
+ #Find DNA sequence length
+ seq = get_sequence(fastafile)
+ dnaseqlength = len(seq)
+ #Run Glimmer for prokaryotic sequences, GlimmerHMM for eukaryotic sequences
+ if taxon == "p":
+ """
+ GlimmerPrediction, not needed since we can predict it in galaxy on our own
+ if genomeconf == "l":
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+ else:
+ os.system("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+ else:
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+ else:
+ os.system("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs")
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
+ else:
+ os.system("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train")
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
+ else:
+ os.system("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train")
+ if genomeconf == "l":
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+ else:
+ os.system("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+ else:
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+ else:
+ os.system("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0])
+ #Convert glimmer predictions into EMBL with sequence
+ glfile = fastafile.rpartition(".")[0] + ".predict"
+
+ Ende der Glimmer-Prediction
+ """
+ glfile = glimmer_prediction_path
+ emblfile = fastafile.rpartition(".")[0] + ".embl"
+ try:
+ file = open(glfile,"r")
+ filetext = file.read()
+ except:
+ print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11."
+ logfile = open("antismash.log","w")
+ logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11.\n")
+ logfile.close()
+ sys.exit(1)
+ if "orf" not in filetext:
+ print >> sys.stderr, "Glimmer gene prediction failed: no genes found."
+ logfile = open("antismash.log","w")
+ logfile.write("Glimmer gene prediction failed: no genes found.\n")
+ logfile.close()
+ sys.exit(1)
+ filetext = filetext.replace("\r","\n")
+ lines = filetext.split("\n")
+ lines = lines[1:-1]
+ orfnames = []
+ starts = []
+ ends = []
+ strands = []
+ starts2 = []
+ ends2 = []
+ firstline = "y"
+ for i in lines:
+ columns = i.split(" ")
+ columns2 = []
+ for i in columns:
+ if i != "":
+ columns2.append(i)
+ columns = columns2
+ if len(columns) > 3:
+ frame = columns[3][0]
+ strands.append(frame)
+ else:
+ frame = ""
+ if firstline == "y" and frame == "+" and len(columns) > 3:
+ orfname = str(columns[0])
+ orfnames.append(orfname)
+ if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
+ gstart = (int(columns[2]) % 3) + 1
+ if gstart == 3:
+ gstart = 0
+ starts.append(str(gstart))
+ ends.append(columns[2])
+ starts.append(columns[1])
+ ends.append(str(dnaseqlength))
+ else:
+ starts.append(columns[1])
+ ends.append(columns[2])
+ firstline = "n"
+ elif firstline == "y" and frame == "-" and len(columns) > 3:
+ orfname = str(columns[0])
+ orfnames.append(orfname)
+ if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)):
+ gstart = (int(columns[2]) % 3) + 1
+ if gstart == 3:
+ gstart = 0
+ starts.append("complement(" + str(gstart))
+ ends.append(columns[2] + ")")
+ starts.append("complement(" + columns[1])
+ ends.append(str(dnaseqlength) + ")")
+ else:
+ complstart = "complement(" + str(columns[1])
+ starts.append(complstart)
+ complend = str(columns[2]) + ")"
+ ends.append(str(complend))
+ firstline = "n"
+ elif frame == "+" and len(columns) > 3:
+ orfname = str(columns[0])
+ orfnames.append(orfname)
+ starts.append(columns[1])
+ ends.append(columns[2])
+ elif frame == "-" and len(columns) > 3:
+ orfname = str(columns[0])
+ orfnames.append(orfname)
+ complstart = "complement(" + str(columns[1])
+ starts.append(complstart)
+ complend = str(columns[2]) + ")"
+ ends.append(str(complend))
+ if len(orfnames) == 0:
+ print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10."
+ logfile = open("antismash.log","w")
+ logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10.\n")
+ logfile.close()
+ sys.exit(1)
+ out_file = open(emblfile,"w")
+ a = 0
+ #print "Writing EMBL file with Glimmer-predicted genes..."
+ logfile = open("antismash.log","w")
+ logfile.write(loginfo)
+ logfile.write("Writing EMBL file with Glimmer-predicted genes...\n")
+ #logfile.close()
+ loginfo = open("antismash.log","r").read()
+ #logfile.close()
+ if taxon == "p":
+ out_file.write("ID A01; SV 1; linear; DNA; STD; PRO; " + str(dnaseqlength) + " BP.\nXX\n")
+ else:
+ out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
+ out_file.write("AC A01;\nXX\n")
+ out_file.write("DE " + genomename + ";\nXX\n")
+ out_file.write("KW none;\nXX\n")
+ out_file.write("OS unknown;\n")
+ if taxon == "p":
+ out_file.write("OC Eubacteria;\nXX\n")
+ else:
+ out_file.write("OC Fungi;\nXX\n")
+ out_file.write("RN [1]\n")
+ out_file.write("RT ;\n")
+ out_file.write("RL Unknown.\nXX\n")
+ out_file.write("FH Key Location/Qualifiers\nFH\n")
+ out_file.write("FT source 1.." + str(dnaseqlength) + "\n")
+ for i in orfnames:
+ out_file.write("FT gene ")
+ out_file.write(starts[a])
+ out_file.write("..")
+ out_file.write(ends[a])
+ out_file.write("\n")
+ out_file.write('FT /gene="' + i + '"\n')
+ out_file.write("FT CDS ")
+ out_file.write(starts[a])
+ out_file.write("..")
+ out_file.write(ends[a])
+ out_file.write("\n")
+ out_file.write('FT /gene="' + i + '"\n')
+ a += 1
+ elif taxon == "e":
+ """
+ GlimmerHMM is executed extern ... in galaxy and will be provided through glimmer_prediction_path
+
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
+ else:
+ os.system("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g")
+ """
+ #Convert glimmerhmm predictions into EMBL with sequence
+ #glfile = fastafile.rpartition(".")[0] + ".predict"
+ glfile = glimmer_prediction_path
+ emblfile = fastafile.rpartition(".")[0] + ".embl"
+ try:
+ file = open(glfile,"r")
+ filetext = file.read().replace("\r","")
+ except:
+ print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9."
+ logfile = open("antismash.log","w")
+ logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9.\n")
+ logfile.close()
+ sys.exit(1)
+ if "CDS" not in filetext:
+ print >> sys.stderr, "GlimmerHMM gene prediction failed: no genes found."
+ logfile = open("antismash.log","w")
+ logfile.write("GlimmerHMM gene prediction failed: no genes found.\n")
+ logfile.close()
+ sys.exit(1)
+ filetext = filetext.replace("\r","\n")
+ lines = filetext.split("\n")
+ lines = lines[2:-1]
+ orfnames = []
+ positions = []
+ firstline = "y"
+ x = 0
+ orfnr = 0
+ starts = []
+ ends = []
+ for i in lines:
+ columns = i.split("\t")
+ if len(columns) > 1:
+ if x == 0:
+ strand = columns[6]
+ if "mRNA" not in i:
+ starts.append(columns[3])
+ ends.append(columns[4])
+ elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]:
+ strand = columns[6]
+ starts.append(columns[3])
+ ends.append(columns[4])
+ orfnr += 1
+ if len(str(orfnr)) == 1:
+ orfname = "orf0000" + str(orfnr)
+ elif len(str(orfnr)) == 2:
+ orfname = "orf000" + str(orfnr)
+ elif len(str(orfnr)) == 3:
+ orfname = "orf00" + str(orfnr)
+ elif len(str(orfnr)) == 4:
+ orfname = "orf0" + str(orfnr)
+ elif len(str(orfnr)) == 5:
+ orfname = "orf" + str(orfnr)
+ orfnames.append(orfname)
+ if strand == "+":
+ if len(starts) == 1:
+ pos = starts[0] + ".." + ends[0]
+ positions.append(pos)
+ else:
+ pos = "join("
+ y = 0
+ for i in starts:
+ pos = pos + i + ".." + ends[y]
+ if i != starts[-1]:
+ pos = pos + ","
+ y += 1
+ pos = pos + ")"
+ positions.append(pos)
+ elif strand == "-":
+ if len(starts) == 1:
+ pos = "complement(" + starts[0] + ".." + ends[0] + ")"
+ positions.append(pos)
+ else:
+ pos = "complement(join("
+ y = 0
+ for i in starts:
+ pos = pos + i + ".." + ends[y]
+ if i != starts[-1]:
+ pos = pos + ","
+ y += 1
+ pos = pos + "))"
+ positions.append(pos)
+ starts = []
+ ends = []
+ elif "mRNA" not in i:
+ starts.append(columns[3])
+ ends.append(columns[4])
+ x += 1
+ if len(orfnames) == 0:
+ print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error: 12"
+ logfile = open("antismash.log","w")
+ logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 12\n")
+ logfile.close()
+ sys.exit(1)
+ out_file = open(emblfile,"w")
+ a = 0
+ #print "Writing EMBL file with GlimmerHMM-predicted genes..."
+ logfile = open("antismash.log","w")
+ logfile.write(loginfo)
+ logfile.write("Writing EMBL file with GlimmerHMM-predicted genes...\n")
+ #logfile.close()
+ loginfo = open("antismash.log","r").read()
+ #logfile.close()
+ out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n")
+ out_file.write("AC A01;\nXX\n")
+ out_file.write("DE " + genomename + ";\nXX\n")
+ out_file.write("KW none;\nXX\n")
+ out_file.write("OS unknown;\n")
+ out_file.write("OC Fungi;\nXX\n")
+ out_file.write("RN [1]\n")
+ out_file.write("RT ;\n")
+ out_file.write("RL Unknown.\nXX\n")
+ out_file.write("FH Key Location/Qualifiers\nFH\n")
+ out_file.write("FT source 1.." + str(dnaseqlength) + "\n")
+ for i in orfnames:
+ out_file.write("FT gene ")
+ out_file.write(positions[a])
+ out_file.write("\n")
+ out_file.write('FT /gene="' + i + '"\n')
+ out_file.write("FT CDS ")
+ out_file.write(positions[a])
+ out_file.write("\n")
+ out_file.write('FT /gene="' + i + '"\n')
+ a += 1
+ out_file.write("XX\nSQ Sequence " + str(dnaseqlength) + " BP; " + str(seq.count("a") + seq.count("A")) + " A; " + str(seq.count("c") + seq.count("C")) + " C; " + str(seq.count("g") + seq.count("G")) + " G; " + str(seq.count("t") + seq.count("T")) + " T; " + str(dnaseqlength - (seq.count("a") + seq.count("A") + seq.count("c") + seq.count("C") + seq.count("g") + seq.count("G") + seq.count("t") + seq.count("T"))) + " other;\n")
+ seq2 = seq
+ out_file.write(" ")
+ grouplen=10
+ textlen = len(seq)
+ end = textlen - (textlen % grouplen)
+ repeated_iterator = [iter(itertools.islice(seq, 0, end))] * grouplen
+ parts = list(itertools.imap(lambda *chars: ''.join(chars),*repeated_iterator))
+ if dnaseqlength%grouplen != 0:
+ parts.append(seq[-1 * (dnaseqlength%grouplen):])
+ w = 1
+ for l in parts:
+ out_file.write(l + " ")
+ if w == len(parts):
+ if w%6 == 0 and dnaseqlength%60 != 0:
+ out_file.write((" " * (10 - dnaseqlength%grouplen) + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+ elif dnaseqlength%60 == 0:
+ out_file.write((" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+ elif w%6 == 5 and dnaseqlength%grouplen == 0:
+ out_file.write((" " + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+ elif dnaseqlength%grouplen != 0:
+ out_file.write(" " * (10 - dnaseqlength%grouplen) + " " * (6 - len(parts)%6) + " " * (6 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+ else:
+ out_file.write(" " * (6 - len(parts)%6) + " " * (5 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//")
+ elif w%6 == 0:
+ out_file.write((" " * (10 - len(str(w * 10)))) + str(w * 10) + "\n ")
+ w += 1
+ out_file.close()
+ os.chdir("../../")
+ infile = emblfile[6:]
+ emblfile = emblfile[6:]
+ if taxon == "p":
+ glimmeroutputfolder = genomename + "/glimmer/"
+ elif taxon == "e":
+ glimmeroutputfolder = genomename + "/glimmerhmm/"
+ try:
+ os.mkdir(glimmeroutputfolder)
+ except(IOError,OSError):
+ pass
+ proteins = embl2proteins(infile,sequence)
+ genomic_accnr = proteins[1]
+ dnaseqlength = proteins[2]
+ proteins = proteins[0]
+ writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
+ else:
+ #print "Reading embl/gbk file and creating input FASTA file for gene cluster detection..."
+ logfile.write("Reading embl/gbk file and creating input FASTA file for gene cluster detection...\n")
+ if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB":
+ sequence = ""
+ proteins = embl2proteins(infile,sequence)
+ genomic_accnr = proteins[1]
+ dnaseqlength = proteins[2]
+ proteins = proteins[0]
+ writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
+ elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK":
+ proteins = gbk2proteins(infile)
+ genomic_accnr = proteins[1]
+ dnaseqlength = proteins[2]
+ proteins = proteins[0]
+ writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta")
+ accessiondict = proteins[4]
+ seqdict = {}
+ fullnamedict = {}
+ strandsdict = {}
+ z = 0
+ for i in proteins[0]:
+ name = i.split("|")[4]
+ seq = proteins[1][z]
+ seqdict[name] = seq
+ strand = i.split("|")[3]
+ strandsdict[name] = strand
+ fullnamedict[name] = i
+ z += 1
+
+ elapsed = (time.time() - starttime)
+ #print "2968Time since start: " + str(elapsed)
+
+ #Run hmmsearch on proteins from input file and parse output
+ #print "Performing HMM search on proteins for detection of signature genes..."
+ logfile.write("Performing HMM search on proteins for detection of signature genes...\n")
+ hmmslist = ["AMP-binding.hmm","BLS.hmm","CAS.hmm","Chal_sti_synt_C.hmm","Chal_sti_synt_N.hmm","Condensation.hmm","ene_KS.hmm","hyb_KS.hmm","itr_KS.hmm","mod_KS.hmm","tra_KS.hmm","LANC_like.hmm","ATd.hmm","PKS_AT.hmm","PKS_KS.hmm","PP-binding.hmm","t2clf.hmm","t2ks.hmm","t2ks2.hmm","Terpene_synth.hmm","Terpene_synth_C.hmm","strH_like.hmm","neoL_like.hmm","DOIS.hmm","valA_like.hmm","spcFG_like.hmm","spcDK_like_cou.hmm","spcDK_like_glyc.hmm","strK_like1.hmm","strK_like2.hmm","bt1fas.hmm","ft1fas.hmm","t2fas.hmm","hglD.hmm","hglE.hmm","fabH.hmm","AfsA.hmm","IucA_IucC.hmm","ectoine_synt.hmm","phytoene_synt.hmm","Lant_dehyd_N.hmm","Lant_dehyd_C.hmm","Antimicrobial18.hmm","Gallidermin.hmm","L_biotic_typeA.hmm","LE-DUF.hmm","LE-LAC481.hmm","LE-LanBC.hmm","LE-MER+2PEP.hmm","MA-2PEPA.hmm","MA-DUF.hmm","MA-EPI.hmm","MA-LAC481.hmm","MA-NIS+EPI.hmm","MA-NIS.hmm","indsynth.hmm","A-OX.hmm","LmbU.hmm","MoeO5.hmm","LipM.hmm","LipU.hmm","LipV.hmm","ToyB.hmm","TunD.hmm","melC.hmm","strepbact.hmm","goadsporin_like.hmm","Antimicrobial14.hmm","Bacteriocin_IId.hmm","BacteriocIIc_cy.hmm","Bacteriocin_II.hmm","Lactococcin.hmm","Antimicrobial17.hmm","Lactococcin_972.hmm","Bacteriocin_IIc.hmm","LcnG-beta.hmm","Bacteriocin_IIi.hmm","Subtilosin_A.hmm","Cloacin.hmm","Neocarzinostat.hmm","Linocin_M18.hmm","TIGR03603.hmm","TIGR03604.hmm","TIGR03605.hmm","TIGR03731.hmm","TIGR03651.hmm","TIGR03678.hmm","TIGR03693.hmm","TIGR03798.hmm","TIGR03882.hmm","TIGR03601.hmm","TIGR03602.hmm","tabtoxin.hmm","cycdipepsynth.hmm","cyanobactin_synth.hmm","fom1.hmm","bcpB.hmm","frbD.hmm","mitE.hmm",'Lycopene_cycl.hmm','terpene_cyclase.hmm','NapT7.hmm','fung_ggpps.hmm','fung_ggpps2.hmm','dmat.hmm','trichodiene_synth.hmm','novK.hmm','novJ.hmm','novI.hmm','novH.hmm','pur6.hmm','pur10.hmm','nikJ.hmm','nikO.hmm','mvnA.hmm','thiostrepton.hmm','NAD_binding_4.hmm','vlmB.hmm','salQ.hmm','prnB.hmm']
+ for i in hmmslist:
+ hmmsearch = hmmsearch_path + " " + "--cpu " + str(nrcpus) + " -o " + genomename + "/hmmoutput/" + i.split(".")[0] + "_output.txt" + " --noali --tblout " + genomename + "/hmmoutput/" + i.split(".")[0] + ".txt " + hmms_path + i + " " + genomename + "/genome_proteins.fasta"
+ os.system(hmmsearch)
+ #print "Parsing HMM outputs..."
+ logfile.write("Parsing HMM outputs...\n")
+ detecteddomainsdict = {}
+ #Extract type I PKS proteins, KS cut-off: 50; AT cut-off: 20; exclude those sequences that score higher on type I FAS HMMs, type IV hglE-like KS domains
+ t1pksprots = []
+ transatpksprots = []
+ if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
+ ks = parsehmmoutput(50,hmmoutputfolder + "PKS_KS.txt")
+ at = parsehmmoutput(50,hmmoutputfolder + "PKS_AT.txt")
+ ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt")
+ bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt")
+ hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt")
+ hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt")
+ fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt")
+ pksksprots = ks[0]
+ pksatprots = at[0]
+ pksatscores = at[1]
+ pksksscores = ks[1]
+ bt1fasprots = bt1fasks[0]
+ bt1fasscores = bt1fasks[1]
+ ft1fasprots = ft1fasks[0]
+ ft1fasscores = ft1fasks[1]
+ hgleprots = hgleks[0]
+ hglescores = hgleks[1]
+ hgldprots = hgldks[0]
+ hgldscores = hgldks[1]
+ fabhprots = fabhks[0]
+ fabhscores = fabhks[1]
+ for i in pksksprots:
+ exclude = "n"
+ score = pksksscores[pksksprots.index(i)]
+ if i in bt1fasprots:
+ bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+ if float(score) < float(bt1fasscore):
+ exclude = "y"
+ if i in ft1fasprots:
+ ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+ if float(score) < float(ft1fasscore):
+ exclude = "y"
+ if i in hgldprots:
+ hgldscore = hgldscores[hgldprots.index(i)]
+ if float(score) < float(hgldscore):
+ exclude = "y"
+ if i in hgleprots:
+ hglescore = hglescores[hgleprots.index(i)]
+ if float(score) < float(hglescore):
+ exclude = "y"
+ if i in fabhprots:
+ fabhscore = fabhscores[fabhprots.index(i)]
+ if float(score) < float(fabhscore):
+ exclude = "y"
+ if i in pksatprots and exclude == "n":
+ t1pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]])
+ detdomlist.append(["PKS acyltransferase domain",pksatscores[pksatprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]],["PKS acyltransferase domain",pksatscores[pksatprots.index(i)]]]
+ #Extract trans-AT PKSs: proteins with KS hits but without AT hits, and with trans-AT specific ATd-hits
+ atd = parsehmmoutput(65,hmmoutputfolder + "ATd.txt")
+ traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt")
+ traksprots = traks[0]
+ atdprots = atd[0]
+ atdscores = atd[1]
+ for i in pksksprots:
+ if i in atdprots and i in traksprots and i not in t1pksprots:
+ transatpksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]])
+ detdomlist.append(["Trans-AT PKS AT-docking domain",atdscores[atdprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]],["Trans-AT PKS AT-docking domain",atdscores[atdprots.index(i)]]]
+ #Extract type II PKS & CLF proteins, KS-cut-off: 50, t2KS/clf score > modKS,eneKS,itrKS,traKS,t1fas,t2fas,hgle scores
+ t2pksprots = []
+ if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
+ t2ks = parsehmmoutput(50,hmmoutputfolder + "t2ks.txt")
+ t2ks2 = parsehmmoutput(450,hmmoutputfolder + "t2ks2.txt")
+ t2clf = parsehmmoutput(50,hmmoutputfolder + "t2clf.txt")
+ eneks = parsehmmoutput(50,hmmoutputfolder + "ene_KS.txt")
+ hybks = parsehmmoutput(50,hmmoutputfolder + "hyb_KS.txt")
+ modks = parsehmmoutput(50,hmmoutputfolder + "mod_KS.txt")
+ itrks = parsehmmoutput(50,hmmoutputfolder + "itr_KS.txt")
+ traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt")
+ t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt")
+ ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt")
+ bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt")
+ hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt")
+ hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt")
+ fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt")
+ t2ksprots = t2ks[0]
+ t2ks2prots = t2ks2[0]
+ t2clfprots = t2clf[0]
+ eneksprots = eneks[0]
+ hybksprots = hybks[0]
+ modksprots = modks[0]
+ itrksprots = itrks[0]
+ traksprots = traks[0]
+ t2fasprots = t2fasks[0]
+ t2ksscores = t2ks[1]
+ t2ks2scores = t2ks2[1]
+ t2clfscores = t2clf[1]
+ eneksscores = eneks[1]
+ hybksscores = hybks[1]
+ modksscores = modks[1]
+ itrksscores = itrks[1]
+ traksscores = traks[1]
+ t2fasscores = t2fasks[1]
+ bt1fasprots = bt1fasks[0]
+ bt1fasscores = bt1fasks[1]
+ ft1fasprots = ft1fasks[0]
+ ft1fasscores = ft1fasks[1]
+ hgleprots = hgleks[0]
+ hglescores = hgleks[1]
+ hgldprots = hgldks[0]
+ hgldscores = hgldks[1]
+ fabhprots = fabhks[0]
+ fabhscores = fabhks[1]
+ for i in t2ksprots:
+ type2 = "y"
+ score = t2ksscores[t2ksprots.index(i)]
+ if i in eneksprots:
+ enescore = eneksscores[eneksprots.index(i)]
+ if float(enescore) > float(score):
+ type2 = "n"
+ if i in hybksprots:
+ hybscore = hybksscores[hybksprots.index(i)]
+ if float(hybscore) > float(score):
+ type2 = "n"
+ if i in modksprots:
+ modscore = modksscores[modksprots.index(i)]
+ if float(modscore) > float(score):
+ type2 = "n"
+ if i in itrksprots:
+ itrscore = itrksscores[itrksprots.index(i)]
+ if float(itrscore) > float(score):
+ type2 = "n"
+ if i in traksprots:
+ trascore = traksscores[traksprots.index(i)]
+ if float(trascore) > float(score):
+ type2 = "n"
+ if i in bt1fasprots:
+ bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+ if float(bt1fasscore) > float(score):
+ type2 = "n"
+ if i in ft1fasprots:
+ ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+ if float(ft1fasscore) > float(score):
+ type2 = "n"
+ if i in t2fasprots:
+ t2fasscore = t2fasscores[t2fasprots.index(i)]
+ if float(t2fasscore) > float(score):
+ type2 = "n"
+ if i in hgleprots:
+ hglescore = hglescores[hgleprots.index(i)]
+ if float(hglescore) > float(score):
+ type2 = "n"
+ if i in fabhprots:
+ fabhscore = fabhscores[fabhprots.index(i)]
+ if float(fabhscore) > float(score):
+ type2 = "n"
+ if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
+ t2pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]]]
+ for i in t2clfprots:
+ type2 = "y"
+ score = t2clfscores[t2clfprots.index(i)]
+ if i in eneksprots:
+ enescore = eneksscores[eneksprots.index(i)]
+ if float(enescore) > float(score):
+ type2 = "n"
+ if i in hybksprots:
+ hybscore = hybksscores[hybksprots.index(i)]
+ if float(hybscore) > float(score):
+ type2 = "n"
+ if i in modksprots:
+ modscore = modksscores[modksprots.index(i)]
+ if float(modscore) > float(score):
+ type2 = "n"
+ if i in itrksprots:
+ itrscore = itrksscores[itrksprots.index(i)]
+ if float(itrscore) > float(score):
+ type2 = "n"
+ if i in traksprots:
+ trascore = traksscores[traksprots.index(i)]
+ if float(trascore) > float(score):
+ type2 = "n"
+ if i in bt1fasprots:
+ bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+ if float(bt1fasscore) > float(score):
+ type2 = "n"
+ if i in ft1fasprots:
+ ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+ if float(ft1fasscore) > float(score):
+ type2 = "n"
+ if i in t2fasprots:
+ t2fasscore = t2fasscores[t2fasprots.index(i)]
+ if float(t2fasscore) > float(score):
+ type2 = "n"
+ if i in hgleprots:
+ hglescore = hglescores[hgleprots.index(i)]
+ if float(hglescore) > float(score):
+ type2 = "n"
+ if i in fabhprots:
+ fabhscore = fabhscores[fabhprots.index(i)]
+ if float(fabhscore) > float(score):
+ type2 = "n"
+ if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
+ t2pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Chain length factor",t2clfscores[t2clfprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Chain length factor",t2clfscores[t2clfprots.index(i)]]]
+ for i in t2ks2prots:
+ type2 = "y"
+ score = t2ks2scores[t2ks2prots.index(i)]
+ if i in eneksprots:
+ enescore = eneksscores[eneksprots.index(i)]
+ if float(enescore) > float(score):
+ type2 = "n"
+ if i in hybksprots:
+ hybscore = hybksscores[hybksprots.index(i)]
+ if float(hybscore) > float(score):
+ type2 = "n"
+ if i in modksprots:
+ modscore = modksscores[modksprots.index(i)]
+ if float(modscore) > float(score):
+ type2 = "n"
+ if i in itrksprots:
+ itrscore = itrksscores[itrksprots.index(i)]
+ if float(itrscore) > float(score):
+ type2 = "n"
+ if i in traksprots:
+ trascore = traksscores[traksprots.index(i)]
+ if float(trascore) > float(score):
+ type2 = "n"
+ if i in bt1fasprots:
+ bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+ if float(bt1fasscore) > float(score):
+ type2 = "n"
+ if i in ft1fasprots:
+ ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+ if float(ft1fasscore) > float(score):
+ type2 = "n"
+ if i in t2fasprots:
+ t2fasscore = t2fasscores[t2fasprots.index(i)]
+ if float(t2fasscore) > float(score):
+ type2 = "n"
+ if i in hgleprots:
+ hglescore = hglescores[hgleprots.index(i)]
+ if float(hglescore) > float(score):
+ type2 = "n"
+ if i in fabhprots:
+ fabhscore = fabhscores[fabhprots.index(i)]
+ if float(fabhscore) > float(score):
+ type2 = "n"
+ if type2 == "y" and i not in t2pksprots and i not in t1pksprots:
+ t2pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]]]
+ #Extract type III PKS proteins
+ t3pksprots = []
+ if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
+ t3n = parsehmmoutput(63,hmmoutputfolder + "Chal_sti_synt_N.txt")
+ t3c = parsehmmoutput(35,hmmoutputfolder + "Chal_sti_synt_C.txt")
+ t3nprots = t3n[0]
+ t3nscores = t3n[1]
+ t3cprots = t3c[0]
+ t3cscores = t3c[1]
+ for i in t3cprots:
+ if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots:
+ t3pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]]]
+ for i in t3nprots:
+ if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots:
+ t3pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]]]
+ #Extract 'type IV' hglE-like PKS proteins, cut-off:50; only if not already scored as type 1-3 PKS, and not if FAS HMM has higher score
+ t4pksprots = []
+ if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes:
+ t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt")
+ t2fasprots = t2fasks[0]
+ t2fasscores = t2fasks[1]
+ for i in hgleprots:
+ type4 = "y"
+ score = hglescores[hgleprots.index(i)]
+ if i in bt1fasprots:
+ bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+ if float(bt1fasscore) > float(score):
+ type4 = "n"
+ if i in ft1fasprots:
+ ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+ if float(ft1fasscore) > float(score):
+ type4 = "n"
+ if i in t2fasprots:
+ t2fasscore = t2fasscores[t2fasprots.index(i)]
+ if float(t2fasscore) > float(score):
+ type4 = "n"
+ if i in fabhprots:
+ fabhscore = fabhscores[fabhprots.index(i)]
+ if float(fabhscore) > float(score):
+ type4 = "n"
+ if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y":
+ t4pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]]]
+ for i in hgldprots:
+ type4 = "y"
+ score = hgldscores[hgldprots.index(i)]
+ if i in bt1fasprots:
+ bt1fasscore = bt1fasscores[bt1fasprots.index(i)]
+ if float(bt1fasscore) > float(score):
+ type4 = "n"
+ if i in ft1fasprots:
+ ft1fasscore = ft1fasscores[ft1fasprots.index(i)]
+ if float(ft1fasscore) > float(score):
+ type4 = "n"
+ if i in t2fasprots:
+ t2fasscore = t2fasscores[t2fasprots.index(i)]
+ if float(t2fasscore) > float(score):
+ type4 = "n"
+ if i in fabhprots:
+ fabhscore = fabhscores[fabhprots.index(i)]
+ if float(fabhscore) > float(score):
+ type4 = "n"
+ if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y" and i not in t4pksprots:
+ t4pksprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]]]
+ #Extract NRPS proteins, C cut-off: 20; A cut-off:20, both should be there, or single domain proteins C,A, or T should be within 20kb of each other or a full NRPS
+ nrpsprots = []
+ if 1 in geneclustertypes or 5 in geneclustertypes:
+ cond = parsehmmoutput(20,hmmoutputfolder + "Condensation.txt")
+ amp = parsehmmoutput(20,hmmoutputfolder + "AMP-binding.txt")
+ ampox = parsehmmoutput(50,hmmoutputfolder + "A-OX.txt")
+ ampoxprots = ampox[0]
+ ampoxscores = ampox[1]
+ for i in ampox[0]:
+ if i not in amp:
+ amp.append(i)
+ cprots = cond[0]
+ cscores = cond[1]
+ aprots = amp[0]
+ ascores = amp[1]
+ nrpsprots = []
+ for i in cprots:
+ if i in aprots:
+ nrpsprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Condensation domain",cscores[cprots.index(i)]])
+ if i in aprots:
+ detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]])
+ elif i in ampoxprots:
+ detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in aprots:
+ detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain",ascores[aprots.index(i)]]]
+ elif i in ampoxprots:
+ detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
+ for i in t1pksprots:
+ if i in aprots:
+ nrpsprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in aprots:
+ detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]])
+ elif i in ampoxprots:
+ detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in aprots:
+ detecteddomainsdict[i] = [["Adenylation domain",ascores[aprots.index(i)]]]
+ elif i in ampoxprots:
+ detecteddomainsdict[i] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
+ single_aprots = []
+ single_cprots = []
+ single_pptprots = []
+ pptprots = parsehmmoutput(20,hmmoutputfolder + "PP-binding.txt")[0]
+ for i in aprots:
+ if i not in nrpsprots:
+ single_aprots.append(i)
+ for i in cprots:
+ if i not in nrpsprots:
+ single_cprots.append(i)
+ for i in pptprots:
+ if i not in nrpsprots:
+ single_pptprots.append(i)
+ genelist = proteins[2]
+ genedict = proteins[3]
+ single_aprots_positions = {}
+ single_cprots_positions = {}
+ single_pptprots_positions = {}
+ nrpsprots_positions = {}
+ for j in single_aprots:
+ if j in genelist:
+ protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+ protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+ single_aprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+ for j in single_cprots:
+ if j in genelist:
+ protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+ protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+ single_cprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+ for j in single_pptprots:
+ if j in genelist:
+ protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+ protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+ single_pptprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+ for j in nrpsprots:
+ if j in genelist:
+ protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+ protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+ nrpsprots_positions[j] = int((protend_abs + protstart_abs) / 2)
+ nrpsprots2 = []
+ for i in nrpsprots:
+ nrpsprots2.append(i)
+ for j in single_aprots:
+ include = "n"
+ pos = single_aprots_positions[j]
+ for i in single_cprots:
+ pos2 = single_cprots_positions[i]
+ if abs(pos - pos2) < 20000:
+ include = "y"
+ for i in nrpsprots2:
+ pos2 = nrpsprots_positions[i]
+ if abs(pos - pos2) < 20000:
+ include = "y"
+ if include == "y":
+ nrpsprots.append(j)
+ if detecteddomainsdict.has_key(j):
+ detdomlist = detecteddomainsdict[j]
+ if j in aprots:
+ detdomlist.append(["Adenylation domain",ascores[aprots.index(j)]])
+ elif j in ampoxprots:
+ detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]])
+ detecteddomainsdict[j] = detdomlist
+ else:
+ if j in aprots:
+ detecteddomainsdict[j] = [["Adenylation domain",ascores[aprots.index(j)]]]
+ elif j in ampoxprots:
+ detecteddomainsdict[j] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]]]
+ for j in single_cprots:
+ include = "n"
+ pos = single_cprots_positions[j]
+ for i in single_aprots:
+ pos2 = single_aprots_positions[i]
+ if abs(pos - pos2) < 20000:
+ include = "y"
+ for i in nrpsprots2:
+ pos2 = nrpsprots_positions[i]
+ if abs(pos - pos2) < 20000:
+ include = "y"
+ if include == "y":
+ nrpsprots.append(j)
+ if detecteddomainsdict.has_key(j):
+ detdomlist = detecteddomainsdict[j]
+ detdomlist.append(["Condensation domain",cscores[cprots.index(j)]])
+ detecteddomainsdict[j] = detdomlist
+ else:
+ detecteddomainsdict[j] = [["Condensation domain",cscores[cprots.index(j)]]]
+ #Extract Terpene synthase proteins, various cut-offs
+ terpeneprots = []
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ terpene = parsehmmoutput(23,hmmoutputfolder + "Terpene_synth_C.txt")
+ terpeneprots = terpene[0]
+ terpenescores = terpene[1]
+ for i in terpeneprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ physqualdata = parsehmmoutput(20,hmmoutputfolder + "phytoene_synt.txt")
+ physqualprots = physqualdata[0]
+ physqualscores = physqualdata[1]
+ for i in physqualprots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ lycopenedata = parsehmmoutput(80,hmmoutputfolder + "Lycopene_cycl.txt")
+ lycopeneprots = lycopenedata[0]
+ lycopenescores = lycopenedata[1]
+ for i in lycopeneprots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ terpene_cyclasesdata = parsehmmoutput(50,hmmoutputfolder + "terpene_cyclase.txt")
+ terpene_cyclases = terpene_cyclasesdata[0]
+ terpene_cyclases_scores = terpene_cyclasesdata[1]
+ for i in terpene_cyclases:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ NapT7 = parsehmmoutput(250,hmmoutputfolder + "NapT7.txt")
+ NapT7prots = NapT7[0]
+ NapT7scores = NapT7[1]
+ for i in NapT7prots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NapT7",NapT7scores[NapT7prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NapT7",NapT7scores[NapT7prots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ fung_ggpps = parsehmmoutput(420,hmmoutputfolder + "fung_ggpps.txt")
+ fung_ggppsprots = fung_ggpps[0]
+ fung_ggppsscores = fung_ggpps[1]
+ for i in fung_ggppsprots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ fung_ggpps2 = parsehmmoutput(312,hmmoutputfolder + "fung_ggpps2.txt")
+ fung_ggpps2prots = fung_ggpps2[0]
+ fung_ggpps2scores = fung_ggpps2[1]
+ for i in fung_ggpps2prots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ dmat = parsehmmoutput(200,hmmoutputfolder + "dmat.txt")
+ dmatprots = dmat[0]
+ dmatscores = dmat[1]
+ for i in dmatprots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]]]
+ if 1 in geneclustertypes or 6 in geneclustertypes:
+ trichodiene_synth = parsehmmoutput(150,hmmoutputfolder + "trichodiene_synth.txt")
+ trichodiene_synthprots = trichodiene_synth[0]
+ trichodiene_synthscores = trichodiene_synth[1]
+ for i in trichodiene_synthprots:
+ if i not in terpeneprots:
+ terpeneprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]]]
+ #Extract lantibiotic proteins, LanC cut-off: 80, Lant_dehN & Lant_dehC combination cut-off: 20 each
+ lantprots = []
+ if 1 in geneclustertypes or 7 in geneclustertypes:
+ lantc = parsehmmoutput(80,hmmoutputfolder + "LANC_like.txt")
+ lancprots = lantc[0]
+ lancscores = lantc[1]
+ landehn = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_N.txt")
+ landehnprots = landehn[0]
+ landehnscores = landehn[1]
+ landehc = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_C.txt")
+ landehcprots = landehc[0]
+ landehcscores = landehc[1]
+ lanti1 = parsehmmoutput(20,hmmoutputfolder + "Antimicrobial18.txt")
+ lanti1prots = lanti1[0]
+ lanti1scores = lanti1[1]
+ lanti2 = parsehmmoutput(20,hmmoutputfolder + "Gallidermin.txt")
+ lanti2prots = lanti2[0]
+ lanti2scores = lanti2[1]
+ lanti3 = parsehmmoutput(20,hmmoutputfolder + "L_biotic_typeA.txt")
+ lanti3prots = lanti3[0]
+ lanti3scores = lanti3[1]
+ lanti4 = parsehmmoutput(20,hmmoutputfolder + "LE-DUF.txt")
+ lanti4prots = lanti4[0]
+ lanti4scores = lanti4[1]
+ lanti5 = parsehmmoutput(20,hmmoutputfolder + "LE-LAC481.txt")
+ lanti5prots = lanti5[0]
+ lanti5scores = lanti5[1]
+ lanti6 = parsehmmoutput(20,hmmoutputfolder + "LE-LanBC.txt")
+ lanti6prots = lanti6[0]
+ lanti6scores = lanti6[1]
+ lanti7 = parsehmmoutput(20,hmmoutputfolder + "LE-MER+2PEP.txt")
+ lanti7prots = lanti7[0]
+ lanti7scores = lanti7[1]
+ lanti8 = parsehmmoutput(20,hmmoutputfolder + "MA-2PEPA.txt")
+ lanti8prots = lanti8[0]
+ lanti8scores = lanti8[1]
+ lanti9 = parsehmmoutput(20,hmmoutputfolder + "MA-DUF.txt")
+ lanti9prots = lanti9[0]
+ lanti9scores = lanti9[1]
+ lanti10 = parsehmmoutput(20,hmmoutputfolder + "MA-EPI.txt")
+ lanti10prots = lanti10[0]
+ lanti10scores = lanti10[1]
+ lanti11 = parsehmmoutput(20,hmmoutputfolder + "MA-LAC481.txt")
+ lanti11prots = lanti11[0]
+ lanti11scores = lanti11[1]
+ lanti12 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS+EPI.txt")
+ lanti12prots = lanti12[0]
+ lanti12scores = lanti12[1]
+ lanti13 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS.txt")
+ lanti13prots = lanti13[0]
+ lanti13scores = lanti13[1]
+ lanti14 = parsehmmoutput(18,hmmoutputfolder + "TIGR03731.txt")
+ lanti14prots = lanti14[0]
+ lanti14scores = lanti14[1]
+ lantiprots = lanti1prots + lanti2prots + lanti3prots + lanti4prots + lanti5prots + lanti6prots + lanti7prots + lanti8prots + lanti9prots + lanti10prots + lanti11prots + lanti12prots + lanti13prots + lanti14prots
+ lantiprots2 = []
+ for i in lantiprots:
+ if i not in lantiprots2:
+ lantiprots2.append(i)
+ lantiprots = lantiprots2
+ for i in lancprots:
+ lantprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]]]
+ for i in landehnprots:
+ if i in landehcprots and i not in lantprots:
+ lantprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]])
+ detdomlist.append(["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]],["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]]]
+ for i in lantiprots:
+ if i not in lantprots:
+ lantprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti1prots:
+ detdomlist.append(["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti1prots:
+ detecteddomainsdict[i] = [["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti2prots:
+ detdomlist.append(["Gallidermin domain",lanti2scores[lanti2prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti2prots:
+ detecteddomainsdict[i] = [["Gallidermin domain",lanti2scores[lanti2prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti3prots:
+ detdomlist.append(["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti3prots:
+ detecteddomainsdict[i] = [["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti4prots:
+ detdomlist.append(["LE-DUF domain",lanti4scores[lanti4prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti4prots:
+ detecteddomainsdict[i] = [["LE-DUF domain",lanti4scores[lanti4prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti5prots:
+ detdomlist.append(["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti5prots:
+ detecteddomainsdict[i] = [["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti6prots:
+ detdomlist.append(["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti6prots:
+ detecteddomainsdict[i] = [["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti7prots:
+ detdomlist.append(["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti7prots:
+ detecteddomainsdict[i] = [["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti8prots:
+ detdomlist.append(["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti8prots:
+ detecteddomainsdict[i] = [["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti9prots:
+ detdomlist.append(["MA-DUF domain",lanti9scores[lanti9prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti9prots:
+ detecteddomainsdict[i] = [["MA-DUF domain",lanti9scores[lanti9prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti10prots:
+ detdomlist.append(["MA-EPI domain",lanti10scores[lanti10prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti10prots:
+ detecteddomainsdict[i] = [["MA-EPI domain",lanti10scores[lanti10prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti11prots:
+ detdomlist.append(["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti11prots:
+ detecteddomainsdict[i] = [["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti12prots:
+ detdomlist.append(["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti12prots:
+ detecteddomainsdict[i] = [["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti13prots:
+ detdomlist.append(["MA-NIS domain",lanti13scores[lanti13prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti13prots:
+ detecteddomainsdict[i] = [["MA-NIS domain",lanti13scores[lanti13prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in lanti14prots:
+ detdomlist.append(["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in lanti14prots:
+ detecteddomainsdict[i] = [["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]]]
+ #Bacteriocin proteins, various cut-offs
+ bcinprots = []
+ if 1 in geneclustertypes or 8 in geneclustertypes:
+ bcin1prots = parsehmmoutput(50,hmmoutputfolder + "strepbact.txt")[0]
+ bcin2prots = parsehmmoutput(90,hmmoutputfolder + "Antimicrobial14.txt")[0]
+ bcin3prots = parsehmmoutput(23,hmmoutputfolder + "Bacteriocin_IId.txt")[0]
+ bcin4prots = parsehmmoutput(92,hmmoutputfolder + "BacteriocIIc_cy.txt")[0]
+ bcin5prots = parsehmmoutput(40,hmmoutputfolder + "Bacteriocin_II.txt")[0]
+ bcin6prots = parsehmmoutput(24,hmmoutputfolder + "Lactococcin.txt")[0]
+ bcin7prots = parsehmmoutput(31,hmmoutputfolder + "Antimicrobial17.txt")[0]
+ bcin8prots = parsehmmoutput(25,hmmoutputfolder + "Lactococcin_972.txt")[0]
+ bcin9prots = parsehmmoutput(27,hmmoutputfolder + "Bacteriocin_IIc.txt")[0]
+ bcin10prots = parsehmmoutput(78,hmmoutputfolder + "LcnG-beta.txt")[0]
+ bcin11prots = parsehmmoutput(56,hmmoutputfolder + "Bacteriocin_IIi.txt")[0]
+ bcin12prots = parsehmmoutput(98,hmmoutputfolder + "Subtilosin_A.txt")[0]
+ bcin13prots = parsehmmoutput(27,hmmoutputfolder + "Cloacin.txt")[0]
+ bcin14prots = parsehmmoutput(25,hmmoutputfolder + "Linocin_M18.txt")[0]
+ bcin15prots = parsehmmoutput(150,hmmoutputfolder + "TIGR03603.txt")[0]
+ bcin16prots = parsehmmoutput(440,hmmoutputfolder + "TIGR03604.txt")[0]
+ bcin17prots = parsehmmoutput(200,hmmoutputfolder + "TIGR03605.txt")[0]
+ bcin18prots = parsehmmoutput(18,hmmoutputfolder + "TIGR03651.txt")[0]
+ bcin19prots = parsehmmoutput(35,hmmoutputfolder + "TIGR03678.txt")[0]
+ bcin20prots = parsehmmoutput(400,hmmoutputfolder + "TIGR03693.txt")[0]
+ bcin21prots = parsehmmoutput(16,hmmoutputfolder + "TIGR03798.txt")[0]
+ bcin22prots = parsehmmoutput(150,hmmoutputfolder + "TIGR03882.txt")[0]
+ bcin23prots = parsehmmoutput(50,hmmoutputfolder + "TIGR03601.txt")[0]
+ bcin24prots = parsehmmoutput(50,hmmoutputfolder + "TIGR03602.txt")[0]
+ bcin25prots = parsehmmoutput(20,hmmoutputfolder + "mvnA.txt")[0]
+ bcin26prots = parsehmmoutput(20,hmmoutputfolder + "thiostrepton.txt")[0]
+ bcin1scores = parsehmmoutput(50,hmmoutputfolder + "strepbact.txt")[1]
+ bcin2scores = parsehmmoutput(90,hmmoutputfolder + "Antimicrobial14.txt")[1]
+ bcin3scores = parsehmmoutput(23,hmmoutputfolder + "Bacteriocin_IId.txt")[1]
+ bcin4scores = parsehmmoutput(92,hmmoutputfolder + "BacteriocIIc_cy.txt")[1]
+ bcin5scores = parsehmmoutput(40,hmmoutputfolder + "Bacteriocin_II.txt")[1]
+ bcin6scores = parsehmmoutput(24,hmmoutputfolder + "Lactococcin.txt")[1]
+ bcin7scores = parsehmmoutput(31,hmmoutputfolder + "Antimicrobial17.txt")[1]
+ bcin8scores = parsehmmoutput(25,hmmoutputfolder + "Lactococcin_972.txt")[1]
+ bcin9scores = parsehmmoutput(27,hmmoutputfolder + "Bacteriocin_IIc.txt")[1]
+ bcin10scores = parsehmmoutput(78,hmmoutputfolder + "LcnG-beta.txt")[1]
+ bcin11scores = parsehmmoutput(56,hmmoutputfolder + "Bacteriocin_IIi.txt")[1]
+ bcin12scores = parsehmmoutput(98,hmmoutputfolder + "Subtilosin_A.txt")[1]
+ bcin13scores = parsehmmoutput(27,hmmoutputfolder + "Cloacin.txt")[1]
+ bcin14scores = parsehmmoutput(25,hmmoutputfolder + "Linocin_M18.txt")[1]
+ bcin15scores = parsehmmoutput(150,hmmoutputfolder + "TIGR03603.txt")[1]
+ bcin16scores = parsehmmoutput(440,hmmoutputfolder + "TIGR03604.txt")[1]
+ bcin17scores = parsehmmoutput(200,hmmoutputfolder + "TIGR03605.txt")[1]
+ bcin18scores = parsehmmoutput(18,hmmoutputfolder + "TIGR03651.txt")[1]
+ bcin19scores = parsehmmoutput(35,hmmoutputfolder + "TIGR03678.txt")[1]
+ bcin20scores = parsehmmoutput(400,hmmoutputfolder + "TIGR03693.txt")[1]
+ bcin21scores = parsehmmoutput(16,hmmoutputfolder + "TIGR03798.txt")[1]
+ bcin22scores = parsehmmoutput(150,hmmoutputfolder + "TIGR03882.txt")[1]
+ bcin23scores = parsehmmoutput(50,hmmoutputfolder + "TIGR03601.txt")[1]
+ bcin24scores = parsehmmoutput(50,hmmoutputfolder + "TIGR03602.txt")[1]
+ bcin25scores = parsehmmoutput(20,hmmoutputfolder + "mvnA.txt")[1]
+ bcin26scores = parsehmmoutput(20,hmmoutputfolder + "thiostrepton.txt")[1]
+ bcinprots = bcin1prots + bcin2prots + bcin3prots + bcin4prots + bcin5prots + bcin6prots + bcin7prots + bcin8prots + bcin9prots + bcin10prots + bcin11prots + bcin12prots + bcin13prots + bcin14prots + bcin15prots + bcin16prots + bcin17prots + bcin18prots + bcin19prots + bcin20prots + bcin21prots + bcin22prots + bcin23prots + bcin24prots + bcin25prots + bcin26prots
+ bcinprots2 = []
+ for i in bcinprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin1prots:
+ detdomlist.append(["Putative Streptomyces bacteriocin",bcin1scores[bcin1prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin1prots:
+ detecteddomainsdict[i] = [["Putative Streptomyces bacteriocin",bcin1scores[bcin1prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin2prots:
+ detdomlist.append(["Antimicrobial14 domain",bcin2scores[bcin2prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin2prots:
+ detecteddomainsdict[i] = [["Antimicrobial14 domain",bcin2scores[bcin2prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin3prots:
+ detdomlist.append(["Bacteriocin_IId domain",bcin3scores[bcin3prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin3prots:
+ detecteddomainsdict[i] = [["Bacteriocin_IId domain",bcin3scores[bcin3prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin4prots:
+ detdomlist.append(["BacteriocIIc_cy domain",bcin4scores[bcin4prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin4prots:
+ detecteddomainsdict[i] = [["BacteriocIIc_cy domain",bcin4scores[bcin4prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin5prots:
+ detdomlist.append(["Bacteriocin_II domain",bcin5scores[bcin5prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin5prots:
+ detecteddomainsdict[i] = [["Bacteriocin_II domain",bcin5scores[bcin5prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin6prots:
+ detdomlist.append(["Lactococcin",bcin6scores[bcin6prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin6prots:
+ detecteddomainsdict[i] = [["Lactococcin",bcin6scores[bcin6prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin7prots:
+ detdomlist.append(["Antimicrobial17 domain",bcin7scores[bcin7prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin7prots:
+ detecteddomainsdict[i] = [["Antimicrobial17 domain",bcin7scores[bcin7prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin8prots:
+ detdomlist.append(["Lactococcin_972 domain",bcin8scores[bcin8prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin8prots:
+ detecteddomainsdict[i] = [["Lactococcin_972 domain",bcin8scores[bcin8prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin9prots:
+ detdomlist.append(["Bacteriocin_IIc domain",bcin9scores[bcin9prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin9prots:
+ detecteddomainsdict[i] = [["Bacteriocin_IIc domain",bcin9scores[bcin9prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin10prots:
+ detdomlist.append(["LcnG-beta domain",bcin10scores[bcin10prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin10prots:
+ detecteddomainsdict[i] = [["LcnG-beta domain",bcin10scores[bcin10prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin11prots:
+ detdomlist.append(["Bacteriocin_IIi domain",bcin11scores[bcin11prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin11prots:
+ detecteddomainsdict[i] = [["Bacteriocin_IIi domain",bcin11scores[bcin11prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin12prots:
+ detdomlist.append(["Subtilosin_A domain",bcin12scores[bcin12prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin12prots:
+ detecteddomainsdict[i] = [["Subtilosin_A domain",bcin12scores[bcin12prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin13prots:
+ detdomlist.append(["Cloacin domain",bcin13scores[bcin13prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin13prots:
+ detecteddomainsdict[i] = [["Cloacin domain",bcin13scores[bcin13prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin14prots:
+ detdomlist.append(["Linocin_M18 domain",bcin14scores[bcin14prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin14prots:
+ detecteddomainsdict[i] = [["Linocin_M18 domain",bcin14scores[bcin14prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin15prots:
+ detdomlist.append(["TIGR03603: bacteriocin biosynthesis cyclodehydratase",bcin15scores[bcin15prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin15prots:
+ detecteddomainsdict[i] = [["TIGR03603: bacteriocin biosynthesis cyclodehydratase",bcin15scores[bcin15prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin16prots:
+ detdomlist.append(["TGIR03604: bacteriocin biosynthesis docking scaffold",bcin16scores[bcin16prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin16prots:
+ detecteddomainsdict[i] = [["TGIR03604: bacteriocin biosynthesis docking scaffold",bcin16scores[bcin16prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin17prots:
+ detdomlist.append(["TGIR03605: SagB-type dehydrogenase",bcin17scores[bcin17prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin17prots:
+ detecteddomainsdict[i] = [["TGIR03605: SagB-type dehydrogenase",bcin17scores[bcin17prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin18prots:
+ detdomlist.append(["TIGR03651: bacteriocin, circularin A/uberolysin family",bcin18scores[bcin18prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin18prots:
+ detecteddomainsdict[i] = [["TIGR03651: bacteriocin, circularin A/uberolysin family",bcin18scores[bcin18prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin19prots:
+ detdomlist.append(["TIGR03678: bacteriocin, microcyclamide/patellamide family",bcin19scores[bcin19prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin19prots:
+ detecteddomainsdict[i] = [["TIGR03678: bacteriocin, microcyclamide/patellamide family",bcin19scores[bcin19prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin20prots:
+ detdomlist.append(["TIGR03693: thiazole-containing bacteriocin maturation protein",bcin20scores[bcin20prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin20prots:
+ detecteddomainsdict[i] = [["TIGR03693: thiazole-containing bacteriocin maturation protein",bcin20scores[bcin20prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin21prots:
+ detdomlist.append(["TIGR03798: bacteriocin propeptide",bcin21scores[bcin21prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin21prots:
+ detecteddomainsdict[i] = [["TIGR03798: bacteriocin propeptide",bcin21scores[bcin21prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin22prots:
+ detdomlist.append(["TIGR03882: bacteriocin biosynthesis cyclodehydratase",bcin22scores[bcin22prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin22prots:
+ detecteddomainsdict[i] = [["TIGR03882: bacteriocin biosynthesis cyclodehydratase",bcin22scores[bcin22prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin23prots:
+ detdomlist.append(["TIGR03601: bacteriocin, BA_2677 family",bcin23scores[bcin23prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin23prots:
+ detecteddomainsdict[i] = [["TIGR03601: bacteriocin, BA_2677 family",bcin23scores[bcin23prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin24prots:
+ detdomlist.append(["TIGR03602: bacteriocin protoxin, streptolysin S family",bcin24scores[bcin24prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin24prots:
+ detecteddomainsdict[i] = [["TIGR03602: bacteriocin protoxin, streptolysin S family",bcin24scores[bcin24prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin25prots:
+ detdomlist.append(["Bacteriocin, microviridin family",bcin25scores[bcin25prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin25prots:
+ detecteddomainsdict[i] = [["Bacteriocin, microviridin family",bcin25scores[bcin25prots.index(i)]]]
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ if i in bcin26prots:
+ detdomlist.append(["Thiopeptide, thiostrepton-like",bcin26scores[bcin26prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in bcin26prots:
+ detecteddomainsdict[i] = [["Thiopeptide, thiostrepton-like",bcin26scores[bcin26prots.index(i)]]]
+ if i not in bcinprots2:
+ bcinprots2.append(i)
+ bcinprots = bcinprots2
+ #Extract beta-lactam synthetase proteins, cut-off: 250
+ lactamprots = []
+ if 1 in geneclustertypes or 9 in geneclustertypes:
+ bls = parsehmmoutput(250,hmmoutputfolder + "BLS.txt")
+ blsprots = bls[0]
+ blsscores = bls[1]
+ for i in bls[0]:
+ lactamprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Beta-lactam synthase",blsscores[blsprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Beta-lactam synthase",blsscores[blsprots.index(i)]]]
+ cas = parsehmmoutput(250,hmmoutputfolder + "CAS.txt")
+ casprots = cas[0]
+ casscores = cas[1]
+ for i in cas[0]:
+ if i not in lactamprots:
+ lactamprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Clavulanic acid synthase-like",casscores[casprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Clavulanic acid synthase-like",casscores[casprots.index(i)]]]
+ tabtoxin = parsehmmoutput(500,hmmoutputfolder + "tabtoxin.txt")
+ tabtoxinprots = tabtoxin[0]
+ tabtoxinscores = tabtoxin[1]
+ for i in tabtoxin[0]:
+ if i not in lactamprots:
+ lactamprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]]]
+ #Extract aminoglycoside / aminocyclitol biosynthesis clusters, clusters taken from Flatt & Mahmud et al. 2007
+ amglyccyclprots = []
+ if 1 in geneclustertypes or 10 in geneclustertypes:
+ strH = parsehmmoutput(200,hmmoutputfolder + "strH_like.txt")
+ strhprots = strH[0]
+ strhscores = strH[1]
+ for i in strH[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]]]
+ strK1 = parsehmmoutput(800,hmmoutputfolder + "strK_like1.txt")
+ strk1prots = strK1[0]
+ strk1scores = strK1[1]
+ for i in strK1[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["StrK-like phosphatase",strk1scores[strk1prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["StrK-like phosphatase",strk1scores[strk1prots.index(i)]]]
+ strK2 = parsehmmoutput(650,hmmoutputfolder + "strK_like2.txt")
+ strk2prots = strK2[0]
+ strk2scores = strK2[1]
+ for i in strK2[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]]]
+ neoL = parsehmmoutput(50,hmmoutputfolder + "neoL_like.txt")
+ neolprots = neoL[0]
+ neolscores = neoL[1]
+ for i in neoL[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NeoL-like deacetylase",neolscores[neolprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NeoL-like deacetylase",neolscores[neolprots.index(i)]]]
+ DOIS = parsehmmoutput(500,hmmoutputfolder + "DOIS.txt")
+ doisprots = DOIS[0]
+ doisscores = DOIS[1]
+ for i in DOIS[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["2-deoxy-scyllo-inosose synthase",doisscores[doisprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["2-deoxy-scyllo-inosose synthase",doisscores[doisprots.index(i)]]]
+ valA = parsehmmoutput(600,hmmoutputfolder + "valA_like.txt")
+ valaprots = valA[0]
+ valascores = valA[1]
+ for i in valA[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["2-epi-5-epi-valiolone synthase, ValA-like",valascores[valaprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["2-epi-5-epi-valiolone synthase, ValA-like",valascores[valaprots.index(i)]]]
+ spcFG = parsehmmoutput(200,hmmoutputfolder + "spcFG_like.txt")
+ spcfgprots = spcFG[0]
+ spcfgscores = spcFG[1]
+ for i in spcFG[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["SpcF/SpcG-like glycosyltransferase",spcfgscores[spcfgprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["SpcF/SpcG-like glycosyltransferase",spcfgscores[spcfgprots.index(i)]]]
+ spcDK_glyc = parsehmmoutput(600,hmmoutputfolder + "spcDK_like_glyc.txt")
+ spcdkglycprots = spcDK_glyc[0]
+ spcdkglycscores = spcDK_glyc[1]
+ for i in spcDK_glyc[0]:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["SpcD/SpcK-like thymidylyltransferase",spcdkglycscores[spcdkglycprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["SpcD/SpcK-like thymidylyltransferase",spcdkglycscores[spcdkglycprots.index(i)]]]
+ salQ = parsehmmoutput(480,hmmoutputfolder + "salQ.txt")
+ salqprots = salQ[0]
+ salqscores = salQ[1]
+ for i in salqprots:
+ amglyccyclprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["2-epi-5-epi-valiolone synthase, SalQ-like",salqscores[salqprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["2-epi-5-epi-valiolone synthase, SalQ-like",salqscores[salqprots.index(i)]]]
+ #Extract aminocoumarin biosynthesis clusters
+ aminocoumarinprots = []
+ if 1 in geneclustertypes or 11 in geneclustertypes:
+ novK = parsehmmoutput(200,hmmoutputfolder + "novK.txt")
+ novkprots = novK[0]
+ novkscores = novK[1]
+ for i in novkprots:
+ aminocoumarinprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NovK-like reductase",novkscores[novkprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NovK-like reductase",novkscores[novkprots.index(i)]]]
+ novJ = parsehmmoutput(350,hmmoutputfolder + "novJ.txt")
+ novjprots = novJ[0]
+ novjscores = novJ[1]
+ for i in novjprots:
+ aminocoumarinprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NovJ-like reductase",novjscores[novjprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NovJ-like reductase",novjscores[novjprots.index(i)]]]
+ novI = parsehmmoutput(600,hmmoutputfolder + "novI.txt")
+ noviprots = novI[0]
+ noviscores = novI[1]
+ for i in noviprots :
+ aminocoumarinprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NovI-like cytochrome P450",noviscores[noviprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NovI-like cytochrome P450",noviscores[noviprots.index(i)]]]
+ novH = parsehmmoutput(750,hmmoutputfolder + "novH.txt")
+ novhprots = novH[0]
+ novhscores = novH[1]
+ for i in novhprots:
+ aminocoumarinprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NovH-like protein",novhscores[novhprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NovH-like protein",novhscores[novhprots.index(i)]]]
+ spcDK_like_cou = parsehmmoutput(600,hmmoutputfolder + "spcDK_like_cou.txt")
+ spcDK_like_cou_prots = spcDK_like_cou[0]
+ spcDK_like_cou_scores = spcDK_like_cou[1]
+ for i in spcDK_like_cou_prots:
+ aminocoumarinprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["SpcD/SpcK-like thymidylyltransferase, aminocoumarins group",spcDK_like_cou_scores[spcDK_like_cou_prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["SpcD/SpcK-like thymidylyltransferase, aminocoumarins group",spcDK_like_cou_scores[spcDK_like_cou_prots.index(i)]]]
+ #Extract siderophores biosynthesis proteins, IucA/C and AlcB
+ siderophoreprots = []
+ if 1 in geneclustertypes or 12 in geneclustertypes:
+ siderophore = parsehmmoutput(30,hmmoutputfolder + "IucA_IucC.txt")
+ siderophoreprots = siderophore[0]
+ siderophorescores = siderophore[1]
+ for i in siderophoreprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["IucA-IucC domain",siderophorescores[siderophoreprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["IucA-IucC domain",siderophorescores[siderophoreprots.index(i)]]]
+ #Extract ectoine biosynthesis proteins
+ ectprots = []
+ if 1 in geneclustertypes or 13 in geneclustertypes:
+ ect = parsehmmoutput(35,hmmoutputfolder + "ectoine_synt.txt")
+ ectprots = ect[0]
+ ectscores = ect[1]
+ for i in ectprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Ectoine synthase",ectscores[ectprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Ectoine synthase",ectscores[ectprots.index(i)]]]
+ #Extract butyrolactone biosynthesis proteins
+ butyrprots = []
+ if 1 in geneclustertypes or 14 in geneclustertypes:
+ butyr= parsehmmoutput(25,hmmoutputfolder + "AfsA.txt")
+ butyrprots = butyr[0]
+ butyrscores = butyr[1]
+ for i in butyrprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["AfsA butyrolactone synthesis domain",butyrscores[butyrprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["AfsA butyrolactone synthesis domain",butyrscores[butyrprots.index(i)]]]
+ #Extract indole biosynthesis proteins
+ indoleprots = []
+ if 1 in geneclustertypes or 15 in geneclustertypes:
+ indole = parsehmmoutput(100,hmmoutputfolder + "indsynth.txt")
+ indoleprots = indole[0]
+ indolescores = indole[1]
+ for i in indoleprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["StaD-like chromopyrrolic acid synthase domain",indolescores[indoleprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["StaD-like chromopyrrolic acid synthase domain",indolescores[indoleprots.index(i)]]]
+ #Extract nucleoside antibiotic biosynthesis proteins
+ nucleoprots = []
+ if 1 in geneclustertypes or 16 in geneclustertypes:
+ nucleoprots = []
+ lipm = parsehmmoutput(50,hmmoutputfolder + "LipM.txt")
+ lipmprots = lipm[0]
+ lipmscores = lipm[1]
+ lipu = parsehmmoutput(30,hmmoutputfolder + "LipU.txt")
+ lipuprots = lipu[0]
+ lipuscores = lipu[1]
+ lipv = parsehmmoutput(375,hmmoutputfolder + "LipV.txt")
+ lipvprots = lipv[0]
+ lipvscores = lipv[1]
+ toyb = parsehmmoutput(175,hmmoutputfolder + "ToyB.txt")
+ toybprots = toyb[0]
+ toybscores = toyb[1]
+ tund = parsehmmoutput(200,hmmoutputfolder + "TunD.txt")
+ tundprots = tund[0]
+ tundscores = tund[1]
+ pur6 = parsehmmoutput(200,hmmoutputfolder + "pur6.txt")
+ pur6prots = pur6[0]
+ pur6scores = pur6[1]
+ pur10 = parsehmmoutput(600,hmmoutputfolder + "pur10.txt")
+ pur10prots = pur10[0]
+ pur10scores = pur10[1]
+ nikj = parsehmmoutput(200,hmmoutputfolder + "nikJ.txt")
+ nikjprots = nikj[0]
+ nikjscores = nikj[1]
+ niko = parsehmmoutput(400,hmmoutputfolder + "nikO.txt")
+ nikoprots = niko[0]
+ nikoscores = niko[1]
+ for i in lipmprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["LipM-like nucleotidyltransferase",lipmscores[lipmprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["LipM-like nucleotidyltransferase",lipmscores[lipmprots.index(i)]]]
+ for i in lipuprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["LipU-like protein",lipuscores[lipuprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["LipU-like protein",lipuscores[lipuprots.index(i)]]]
+ for i in lipvprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["LipV-like dehydrogenase",lipvscores[lipvprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["LipV-like dehydrogenase",lipvscores[lipvprots.index(i)]]]
+ for i in toybprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["ToyB-like synthase",toybscores[toybprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["ToyB-like synthase",toybscores[toybprots.index(i)]]]
+ for i in tundprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["TunD-like putative N-acetylglucosamine transferase",tundscores[tundprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["TunD-like putative N-acetylglucosamine transferase",tundscores[tundprots.index(i)]]]
+ for i in pur6prots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Pur6-like synthetase",pur6scores[pur6prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Pur6-like synthetase",pur6scores[pur6prots.index(i)]]]
+ for i in pur10prots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Pur10-like oxidoreductase",pur10scores[pur10prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Pur10-like oxidoreductase",pur10scores[pur10prots.index(i)]]]
+ for i in nikjprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NikJ-like protein",nikjscores[nikjprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["NikJ-like protein",nikjscores[nikjprots.index(i)]]]
+ for i in nikoprots:
+ if i not in nucleoprots:
+ nucleoprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NikO-like enolpyruvyl transferase",nikoscores[nikoprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+
+ detecteddomainsdict[i] = [["NikO-like enolpyruvyl transferase",nikoscores[nikoprots.index(i)]]]
+ #Extract phosphoglycolipid biosynthesis proteins
+ phosphoprots = []
+ if 1 in geneclustertypes or 17 in geneclustertypes:
+ phosphogl = parsehmmoutput(65,hmmoutputfolder + "MoeO5.txt")
+ phosphoprots = phosphogl[0]
+ phosphoscores = phosphogl[1]
+ for i in phosphoprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["MoeO5-like prenyl-3-phosphoglycerate synthase",phosphoscores[phosphoprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["MoeO5-like prenyl-3-phosphoglycerate synthase",phosphoscores[phosphoprots.index(i)]]]
+ #Extract melanin biosynthesis proteins
+ melaninprots = []
+ if 1 in geneclustertypes or 18 in geneclustertypes:
+ melanin = parsehmmoutput(40,hmmoutputfolder + "melC.txt")
+ melaninprots = melanin[0]
+ melaninscores = melanin[1]
+ for i in melaninprots:
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["MelC-like melanin synthase",melaninscores[melaninprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["MelC-like melanin synthase",melaninscores[melaninprots.index(i)]]]
+ #Extract other putative secondary metabolite biosynthesis proteins
+ otherprots = []
+ amp_t_prots = []
+ if 1 in geneclustertypes or 19 in geneclustertypes:
+ pptb = parsehmmoutput(20,hmmoutputfolder + "PP-binding.txt")
+ pptbprots = pptb[0]
+ pptbscores = pptb[1]
+ cond = parsehmmoutput(20,hmmoutputfolder + "Condensation.txt")
+ amp = parsehmmoutput(20,hmmoutputfolder + "AMP-binding.txt")
+ ampprots = amp[0]
+ ampscores = amp[1]
+ ampox = parsehmmoutput(50,hmmoutputfolder + "A-OX.txt")
+ ampoxprots = ampox[0]
+ ampoxscores = ampox[1]
+ nad4 = parsehmmoutput(40,hmmoutputfolder + "NAD_binding_4.txt")
+ nad4prots = nad4[0]
+ nad4scores = nad4[1]
+ cprots = cond[0]
+ aprots = amp[0]
+ for i in ampox[0]:
+ if i not in aprots:
+ aprots.append(i)
+ nrpsprots2 = []
+ for i in cprots:
+ if i in aprots:
+ nrpsprots2.append(i)
+ tprots = pptb[0]
+ for i in tprots:
+ if i in aprots and i not in nrpsprots2 and i not in aminocoumarinprots:
+ otherprots.append(i)
+ amp_t_prots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["PP-binding domain",pptbscores[pptbprots.index(i)]])
+ if i in ampprots:
+ detdomlist.append(["Adenylation domain",ampscores[ampprots.index(i)]])
+ elif i in ampoxprots:
+ detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in ampprots:
+ detecteddomainsdict[i] = [["PP-binding domain",pptbscores[pptbprots.index(i)]],["Adenylation domain",ampscores[ampprots.index(i)]]]
+ elif i in ampoxprots:
+ detecteddomainsdict[i] = [["PP-binding domain",pptbscores[pptbprots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
+ for i in nad4prots:
+ if i in aprots and i not in aminocoumarinprots:
+ otherprots.append(i)
+ amp_t_prots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["NAD-binding domain 4",nad4scores[nad4prots.index(i)]])
+ if i in ampprots:
+ detdomlist.append(["Adenylation domain",ampscores[ampprots.index(i)]])
+ elif i in ampoxprots:
+ detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ if i in ampprots:
+ detecteddomainsdict[i] = [["NAD-binding domain 4",nad4scores[nad4prots.index(i)]],["Adenylation domain",ampscores[ampprots.index(i)]]]
+ elif i in ampoxprots:
+ detecteddomainsdict[i] = [["NAD-binding domain 4",nad4scores[nad4prots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]]
+ lmbu = parsehmmoutput(50,hmmoutputfolder + "LmbU.txt")
+ lmbuprots = lmbu[0]
+ lmbuscores = lmbu[1]
+ for i in lmbuprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["LmbU-like protein",lmbuscores[lmbuprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["LmbU-like protein",lmbuscores[lmbuprots.index(i)]]]
+ goadsporin = parsehmmoutput(500,hmmoutputfolder + "goadsporin_like.txt")
+ goadsporinprots = goadsporin[0]
+ goadsporinscores = goadsporin[1]
+ for i in goadsporinprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Goadsporin-like protein",goadsporinscores[goadsporinprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Goadsporin-like protein",goadsporinscores[goadsporinprots.index(i)]]]
+ neocarzinostat = parsehmmoutput(28,hmmoutputfolder + "Neocarzinostat.txt")
+ neocarzinostatprots = neocarzinostat[0]
+ neocarzinostatscores = neocarzinostat[1]
+ for i in neocarzinostatprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Neocarzinostatin-like protein",neocarzinostatscores[neocarzinostatprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Neocarzinostatin-like protein",neocarzinostatscores[neocarzinostatprots.index(i)]]]
+ cyanobactin = parsehmmoutput(80,hmmoutputfolder + "cyanobactin_synth.txt")
+ cyanobactinprots = cyanobactin[0]
+ cyanobactinscores = cyanobactin[1]
+ for i in cyanobactinprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Cyanobactin protease",cyanobactinscores[cyanobactinprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Cyanobactin protease",cyanobactinscores[cyanobactinprots.index(i)]]]
+ cycdipeptide = parsehmmoutput(110,hmmoutputfolder + "cycdipepsynth.txt")
+ cycdipeptideprots = cycdipeptide[0]
+ cycdipeptidescores = cycdipeptide[1]
+ for i in cycdipeptideprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Cyclodipeptide synthase",cycdipeptidescores[cycdipeptideprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Cyclodipeptide synthase",cycdipeptidescores[cycdipeptideprots.index(i)]]]
+ fom1 = parsehmmoutput(750,hmmoutputfolder + "fom1.txt")
+ fom1prots = fom1[0]
+ fom1scores = fom1[1]
+ for i in fom1prots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Fom1-like phosphomutase",fom1scores[fom1prots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Fom1-like phosphomutase",fom1scores[fom1prots.index(i)]]]
+ bcpb = parsehmmoutput(400,hmmoutputfolder + "bcpB.txt")
+ bcpbprots = bcpb[0]
+ bcpbscores = bcpb[1]
+ for i in bcpbprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["BcpB-like phosphomutase",bcpbscores[bcpbprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["BcpB-like phosphomutase",bcpbscores[bcpbprots.index(i)]]]
+ frbd = parsehmmoutput(350,hmmoutputfolder + "frbD.txt")
+ frbdprots = frbd[0]
+ frbdscores = frbd[1]
+ for i in frbdprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["FrbD-like phosphomutase",frbdscores[frbdprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["FrbD-like phosphomutase",frbdscores[frbdprots.index(i)]]]
+ mite = parsehmmoutput(400,hmmoutputfolder + "mitE.txt")
+ miteprots = mite[0]
+ mitescores = mite[1]
+ for i in miteprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["MitE-like CoA-ligase",mitescores[miteprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["MitE-like CoA-ligase",mitescores[miteprots.index(i)]]]
+ vlmb = parsehmmoutput(250,hmmoutputfolder + "vlmB.txt")
+ vlmbprots = vlmb[0]
+ vlmbscores = vlmb[1]
+ for i in vlmbprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Valanimycin biosynthesis VlmB domain",vlmbscores[vlmbprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Valanimycin biosynthesis VlmB domain",vlmbscores[vlmbprots.index(i)]]]
+ prnb = parsehmmoutput(200,hmmoutputfolder + "prnB.txt")
+ prnbprots = prnb[0]
+ prnbscores = prnb[1]
+ for i in prnbprots:
+ if i not in otherprots:
+ otherprots.append(i)
+ if detecteddomainsdict.has_key(i):
+ detdomlist = detecteddomainsdict[i]
+ detdomlist.append(["Pyrrolnitrin biosynthesis PrnB domain",prnbscores[prnbprots.index(i)]])
+ detecteddomainsdict[i] = detdomlist
+ else:
+ detecteddomainsdict[i] = [["Pyrrolnitrin biosynthesis PrnB domain",prnbscores[prnbprots.index(i)]]]
+ if 5 not in geneclustertypes and 1 not in geneclustertypes:
+ nrpsprots = []
+ if 4 not in geneclustertypes and 1 not in geneclustertypes:
+ t3pksprots = []
+ if 3 not in geneclustertypes and 1 not in geneclustertypes:
+ t2pksprots = []
+ if 2 not in geneclustertypes and 1 not in geneclustertypes:
+ t1pksprots = []
+ t4pksprots = []
+ transatpksprots = []
+ #Assemble all core sec met proteins
+ allsecmetprots = []
+ for i in t1pksprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in transatpksprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in t2pksprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in t3pksprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in t4pksprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in nrpsprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in terpeneprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in lantprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in bcinprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in lactamprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in amglyccyclprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in siderophoreprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in ectprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in butyrprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in indoleprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in nucleoprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in phosphoprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in melaninprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in aminocoumarinprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ for i in otherprots:
+ if i not in allsecmetprots:
+ allsecmetprots.append(i)
+ allsecmetprots.sort()
+
+ if len(allsecmetprots) == 0:
+ logfile.write("No secondary metabolite biosynthesis gene clusters detected in this nucleotide file.\n")
+ logfile.close()
+ print >> sys.stderr, "No secondary metabolite biosynthesis gene clusters detected in this nucleotide file."
+ sys.exit(1)
+
+ elapsed = (time.time() - starttime)
+ #print "4713Time since start: " + str(elapsed)
+
+ #Extract approximate gene clusters based on hmmsearch results, create list of core PKS / NRPS genes for further analysis (use less strict parameters for this then in gene cluster detection to include all PKS/NRPS domains)
+ #Create nucleotide fasta files with sec met gene clusters
+ #print "Extracting gene clusters from gbk/embl file using detected signature genes..."
+ logfile.write("Extracting gene clusters from gbk/embl file using detected signature genes...\n")
+ fastafile = open(genomename + "/clusterblast/geneclusterprots.fasta","w")
+ txtfile = open(genomename + "/clusterblast/geneclusters.txt","w")
+ wb = Workbook()
+ font1 = Font()
+ style1 = XFStyle()
+ style1.font = font1
+ font1.bold = True
+ ws0 = wb.add_sheet('0')
+ ws0.write(0,0,"Input accession number",style1)
+ ws0.write(0,1,"Input name",style1)
+ ws0.write(0,2,"Gene cluster type",style1)
+ ws0.write(0,3,"Gene cluster genes",style1)
+ if clusterblast == "y":
+ ws0.write(0,4,"Compound with gene cluster of highest homology",style1)
+ protcodes = allsecmetprots
+ nuccode = genomename
+ gbkfile = open(infile,"r")
+ output = gbkfile.read()
+ output = output.replace("\r","\n")
+ #Extract description of nucleotide from gbk/embl file
+ if ".gbk" in infile or ".GBK" in infile or ".gb" in infile or ".GB" in infile or ".genbank" in infile or ".GENBANK" in infile:
+ try:
+ nucname1 = output.split("ACCESSION ")[0]
+ nucname2 = nucname1.split("DEFINITION ")[1]
+ nucname3 = nucname2.replace("\n","")
+ while " " in nucname3:
+ nucname3 = nucname3.replace(" "," ")
+ nucname = nucname3
+ except(KeyError,IOError,IndexError):
+ nucname = "input_nucleotide"
+ elif ".embl" in infile or ".EMBL" in infile or ".emb" in infile or ".EMB" in infile:
+ try:
+ nucname1 = output.split("DE ")[1]
+ nucname2 = nucname1.split("\n")[0]
+ nucname3 = nucname2.replace("\n","")
+ while " " in nucname3:
+ nucname3 = nucname3.replace(" "," ")
+ nucname = nucname3
+ except(KeyError,IOError,IndexError):
+ nucname = "input_nucleotide"
+ protstartlocations = []
+ protendlocations = []
+ genelist = proteins[2]
+ genedict = proteins[3]
+ #Save all locations of query proteins on the nucleotide in a list
+ for j in protcodes:
+ if j in genelist:
+ protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])])
+ protend_abs = max([int(genedict[j][0]),int(genedict[j][1])])
+ protstartlocations.append(protstart_abs)
+ protendlocations.append(protend_abs)
+ #Identify clusters of genes based on protein locations on the nucleotide
+ clusterstarts = []
+ clusterends = []
+ protstartlocations.sort()
+ protendlocations.sort()
+ nrlocations = len(protstartlocations)
+ a = 0
+ for i in protstartlocations:
+ if a == 0:
+ start = str(i)
+ clusterstarts.append(start)
+ if len(protendlocations) == 1:
+ clusterends.append(protendlocations[a])
+ elif a == nrlocations - 1:
+ if i < ((protendlocations[a - 1]) + 20000):
+ clusterends.append(str(protendlocations[a]))
+ else:
+ end = str(protendlocations[a - 1])
+ clusterends.append(end)
+ clusterstarts.append(str(i))
+ clusterends.append(str(protendlocations[a]))
+ else:
+ if i > ((protendlocations[a - 1]) + 20000):
+ clusterends.append(str(protendlocations[a - 1]))
+ start = str(i)
+ clusterstarts.append(start)
+ else:
+ pass
+ a += 1
+ lastendlocation = i
+ #Extend clusters with 20kb on each side of the identified core genes
+ clusterstarts2 = []
+ for i in clusterstarts:
+ j = int(i) - 20000
+ if j < 0:
+ j = 0
+ clusterstarts2.append(j)
+ clusterstarts = clusterstarts2
+ clusterends2 = []
+ for i in clusterends:
+ j = int(i) + 20000
+ clusterends2.append(j)
+ clusterends = clusterends2
+ #For each genbank secondary metabolite gene cluster: extract all proteins and write to fasta,
+ a = 0
+ clusterinfo = {}
+ geneclusters = []
+ geneclustergenes = []
+ allcoregenes = []
+ for i in clusterstarts:
+ cstart = int(i)
+ cend = int(clusterends[a])
+ a += 1
+ clusternr = a
+ geneclusters.append(clusternr)
+ coregenes = []
+ clustergenes = []
+ #For each gene in nucleotide, check if it is inside this cluster; if, so append info to list of clustergenes
+ if a == 1:
+ for i in genelist:
+ geneinfo = genedict[i][:-1]
+ geneinfo.append(i)
+ genedict[i] = geneinfo
+ for i in genelist:
+ geneinfo = genedict[i]
+ genestart = int(geneinfo[0])
+ geneend = int(geneinfo[1])
+ if (genestart > cstart and genestart < cend) or (geneend > cstart and geneend < cend):
+ clustergenes.append(geneinfo)
+ #Determine type of cluster
+ type = "other"
+ z = 0
+ for k in clustergenes:
+ i = k[4]
+ if i in t1pksprots:
+ if z == 0:
+ type = "t1pks"
+ elif "t1pks" not in type:
+ type = type + "-t1pks"
+ z = 1
+ if i in transatpksprots:
+ if z == 0:
+ type = "transatpks"
+ elif "transatpks" not in type:
+ type = type + "-transatpks"
+ z = 1
+ if i in t2pksprots:
+ if z == 0:
+ type = "t2pks"
+ elif "t2pks" not in type:
+ type = type + "-t2pks"
+ z = 1
+ if i in t3pksprots:
+ if z == 0:
+ type = "t3pks"
+ elif "t3pks" not in type:
+ type = type + "-t3pks"
+ z = 1
+ if i in t4pksprots:
+ if z == 0:
+ type = "t1pks"
+ elif "t1pks" not in type:
+ type = type + "-t1pks"
+ z = 1
+ if i in nrpsprots:
+ if z == 0:
+ type = "nrps"
+ elif "nrps" not in type:
+ type = type + "-nrps"
+ z = 1
+ if i in terpeneprots:
+ if z == 0:
+ type= "terpene"
+ elif "terpene" not in type:
+ type = type + "-terpene"
+ z = 1
+ if i in lantprots:
+ if z == 0:
+ type= "lant"
+ elif "lant" not in type:
+ type = type + "-lant"
+ z = 1
+ if i in bcinprots:
+ if z == 0:
+ type= "bcin"
+ elif "bcin" not in type:
+ type = type + "-bcin"
+ z = 1
+ if i in lactamprots:
+ if z == 0:
+ type = "blactam"
+ elif "blactam" not in type:
+ type = type + "-blactam"
+ z = 1
+ if i in amglyccyclprots:
+ if z == 0:
+ type = "amglyccycl"
+ elif "amglyccycl" not in type:
+ type = type + "-amglyccycl"
+ z = 1
+ if i in siderophoreprots:
+ if z == 0:
+ type = "siderophore"
+ elif "siderophore" not in type:
+ type = type + "-siderophore"
+ z = 1
+ if i in ectprots:
+ if z == 0:
+ type = "ectoine"
+ elif "ectoine" not in type:
+ type = type + "-ectoine"
+ z = 1
+ if i in indoleprots:
+ if z == 0:
+ type = "indole"
+ elif "indole" not in type:
+ type = type + "-indole"
+ z = 1
+ if i in nucleoprots:
+ if z == 0:
+ type = "nucleoside"
+ elif "nucleoside" not in type:
+ type = type + "-nucleoside"
+ z = 1
+ if i in phosphoprots:
+ if z == 0:
+ type = "phosphoglycolipid"
+ elif "phosphoglycolipid" not in type:
+ type = type + "-phosphoglycolipid"
+ z = 1
+ if i in butyrprots:
+ if z == 0:
+ type = "butyrolactone"
+ elif "butyrolactone" not in type:
+ type = type + "-butyrolactone"
+ z = 1
+ if i in melaninprots:
+ if z == 0:
+ type = "melanin"
+ elif "melanin" not in type:
+ type = type + "-melanin"
+ z = 1
+ if i in aminocoumarinprots:
+ if z == 0:
+ type = "aminocoumarin"
+ elif "aminocoumarin" not in type:
+ type = type + "-aminocoumarin"
+ z = 1
+ if "other-" in type[:6]:
+ type = type[6:]
+ #Shorten gene cluster if type is among typically short gene cluster types
+ if cend > dnaseqlength:
+ cend = dnaseqlength
+ if type == "t3pks" or type == "t2pks":
+ if cstart != 0:
+ cstart = cstart + 5000
+ if cend != dnaseqlength:
+ cend = cend - 5000
+ clustergenes2 = []
+ for i in clustergenes:
+ start = int(i[0])
+ end = int(i[1])
+ if (start > cstart and start < cend) or (end > cstart and end < cend):
+ clustergenes2.append(i)
+ clustergenes = clustergenes2
+ if type == "bcin" or type == "siderophore" or type == "lant" or type == "terpene":
+ if cstart != 0:
+ cstart = cstart + 10000
+ if cend != dnaseqlength:
+ cend = cend - 10000
+ clustergenes2 = []
+ for i in clustergenes:
+ start = int(i[0])
+ end = int(i[1])
+ if (start > cstart and start < cend) or (end > cstart and end < cend):
+ clustergenes2.append(i)
+ clustergenes = clustergenes2
+ if type == "butyrolactone" or type == "melanin" or type == "ectoine":
+ if cstart != 0:
+ cstart = cstart + 17000
+ if cend != dnaseqlength:
+ cend = cend - 17000
+ clustergenes2 = []
+ for i in clustergenes:
+ start = int(i[0])
+ end = int(i[1])
+ if (start > cstart and start < cend) or (end > cstart and end < cend):
+ clustergenes2.append(i)
+ clustergenes = clustergenes2
+ #For all clustergenes, write info to fasta
+ for i in clustergenes:
+ start = str(i[0])
+ end = str(i[1])
+ strand = i[2]
+ seq = seqdict[i[4]]
+ ann = i[3].replace(" ","_")
+ accession = i[4]
+ name = nuccode + "|c" + str(a) + "|" + start + "-" + end + "|" + strand + "|" + accession + "|" + ann
+ fastafile.write(">" + name + "\n" + seq + "\n")
+ if accession not in geneclustergenes:
+ geneclustergenes.append(accession)
+ #Write gene cluster info to separate txt file
+ txtfile.write(nuccode + "\t" + nucname + "\t" + "c" + str(a) + "\t" + type + "\t")
+ ws0.write(a,0,genomic_accnr)
+ try:
+ ws0.write(a,1,nucname)
+ except:
+ ws0.write(a,1,"Name to long to be contained in Excel cell; see txt file in downloadable zip archive.")
+ ws0.write(a,2,type)
+ xlsgenesfield = ""
+ for i in clustergenes:
+ txtfile.write(i[4] + ";")
+ xlsgenesfield = xlsgenesfield + i[4] + ";"
+ txtfile.write("\t")
+ for i in clustergenes:
+ txtfile.write(accessiondict[i[4]] + ";")
+ xlsgenesfield = xlsgenesfield[:-1]
+ try:
+ ws0.write(a,3,xlsgenesfield)
+ except:
+ ws0.write(a,3,"Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.")
+ txtfile.write("\n")
+ #Write gene cluster info to clusterinfo dictionary
+ for i in clustergenes:
+ if i[4] in allsecmetprots:
+ coregenes.append(i[4])
+ allcoregenes.append(i[4])
+ clusterinfo[clusternr] = [type,cstart,cend,coregenes,clustergenes]
+ #Close xls, fasta and txt files
+ fastafile.close()
+ txtfile.close()
+
+ #Analysis of core PKS/NRPS genes (separate py), detect subgroups and predict specificities and final products
+ #Make list of PKS / NRPS gene clusters to be analysed
+ #print "Analysing core PKS/NRPS genes..."
+ logfile.write("Analysing core PKS/NRPS genes...\n")
+ pksnrpsgeneclusters = []
+ pksnrpscoregenes = []
+ for i in geneclusters:
+ if "t1pks" in clusterinfo[i][0] or "t4pks" in clusterinfo[i][0] or "transatpks" in clusterinfo[i][0] or "nrps" in clusterinfo[i][0]:
+ pksnrpsgeneclusters.append(i)
+ for i in t1pksprots:
+ pksnrpscoregenes.append(i)
+ for i in transatpksprots:
+ pksnrpscoregenes.append(i)
+ for i in t4pksprots:
+ pksnrpscoregenes.append(i)
+ for i in nrpsprots:
+ pksnrpscoregenes.append(i)
+ for i in amp_t_prots:
+ pksnrpscoregenes.append(i)
+ pksnrpsgenestartdict = {}
+ for i in pksnrpscoregenes:
+ start = int(genedict[i][0])
+ pksnrpsgenestartdict[i] = start
+ pksnrpscoregenes = sortdictkeysbyvalues(pksnrpsgenestartdict)
+ nrpsnames = []
+ nrpsseqs = []
+ pksnrpsnames = []
+ pksnrpsseqs = []
+ pksnames = []
+ pksseqs = []
+ calnames = []
+ calseqs = []
+ krnames = []
+ krseqs = []
+ nrpspkstypedict = {}
+ domaindict = {}
+ if len(pksnrpscoregenes) > 0:
+ #Write PKS / NRPS core genes to FASTA file
+ for i in pksnrpscoregenes:
+ name = i
+ seq = seqdict[i]
+ pksnrpsnames.append(name)
+ pksnrpsseqs.append(seq)
+ writefasta(pksnrpsnames,pksnrpsseqs,genomename + "/nrpspks_proteins.fasta")
+ #Analyse for abMotifs
+ hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 0.1 -o " + genomename + "/nrpspks/abmotifshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/abmotifshmm.txt "+ hmms_path +"abmotifs.hmm " + genomename + "/nrpspks_proteins.fasta"
+ os.system(hmmsearch)
+ mhmmlengthsdict = hmmlengths(hmms_path+"abmotifs.hmm")
+ motifdict = hmmscanparse(genomename + "/nrpspks/abmotifshmm_output.txt",mhmmlengthsdict)
+ #Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
+ hmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/nrpspkshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/nrpspkshmm.txt "+ hmms_path +"nrpspksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
+ os.system(hmmsearch)
+ hmmlengthsdict = hmmlengths(hmms_path+"nrpspksdomains.hmm")
+ domaindict = hmmscanparse(genomename + "/nrpspks/nrpspkshmm_output.txt",hmmlengthsdict)
+ nrpspksdomainsfile = open(genomename + "/nrpspks/nrpspksdomains.txt","w")
+ #Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types
+ kshmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/kshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/kshmm.txt " + hmms_path + "ksdomains.hmm " + genomename + "/nrpspks_proteins.fasta"
+ os.system(kshmmsearch)
+ kshmmlengthsdict = hmmlengths(hmms_path+"ksdomains.hmm")
+ ksdomaindict = hmmscanparse(genomename + "/nrpspks/kshmm_output.txt",kshmmlengthsdict)
+ for k in pksnrpscoregenes:
+ #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
+ domainlist = []
+ nrKSdomains = 0
+ for i in domaindict[k]:
+ domainlist.append(i[0])
+ if i[0] == "PKS_KS":
+ nrKSdomains += 1
+ modKSscore = 0
+ traKSscore = 0
+ eneKSscore = 0
+ iterKSscore = 0
+ for i in ksdomaindict[k]:
+ if i[0] == "Trans-AT-KS":
+ traKSscore += 1
+ if i[0] == "Modular-KS":
+ modKSscore += 1
+ if i[0] == "Enediyne-KS":
+ eneKSscore += 1
+ if i[0] == "Iterative-KS":
+ iterKSscore += 1
+ for i in domaindict[k]:
+ if "Cglyc" in domainlist and "Epimerization" in domainlist and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
+ type = "Glycopeptide NRPS"
+ elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
+ type = "NRPS"
+ elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) or "AMP-binding" in domainlist and ("PKS_KS" in domainlist or "PKS_AT" in domainlist):
+ type = "Hybrid PKS-NRPS"
+ elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" not in domainlist and "Trans-AT_docking" in domainlist and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore:
+ type = "Type I Trans-AT PKS"
+ elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and iterKSscore > modKSscore and iterKSscore > traKSscore and iterKSscore > eneKSscore and nrKSdomains < 3:
+ type = "Type I Iterative PKS"
+ elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and eneKSscore > modKSscore and eneKSscore > traKSscore and eneKSscore > iterKSscore and nrKSdomains < 3:
+ type = "Type I Enediyne PKS"
+ elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and ((modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore) or nrKSdomains > 3):
+ type = "Type I Modular PKS"
+ elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist:
+ type = "PKS-like protein"
+ elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist or "AMP-binding" in domainlist) and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
+ type = "NRPS-like protein"
+ else:
+ type = "PKS/NRPS-like protein"
+ nrpspkstypedict[k] = type
+ #Write data to output file
+ for k in pksnrpscoregenes:
+ j = domaindict[k]
+ l = motifdict[k]
+ nrpspksdomainsfile.write(">> " + k + "\n")
+ nrpspksdomainsfile.write(">> " + nrpspkstypedict[k] + "\n")
+ nrpspksdomainsfile.write("name\tstart\tend\te-value\tscore\n")
+ for i in j:
+ #nrpspksdomainsfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
+ nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) )
+ nrpspksdomainsfile.write("** Motifs: **\n")
+ for i in l:
+ #nrpspksdomainsfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
+ nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) )
+ nrpspksdomainsfile.write("\n\n")
+ nrpspksdomainsfile.close()
+
+ elapsed = (time.time() - starttime)
+ #print "5163Time since start: " + str(elapsed)
+
+ #Predict NRPS A domain specificities with NRPSPredictor and Minowa et al. method
+ #print "Predicting NRPS A domain substrate specificities by NRPSPredictor"
+ logfile.write("Predicting NRPS A domain substrate specificities by NRPSPredictor\n")
+ #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor
+ for k in pksnrpscoregenes:
+ j = domaindict[k]
+ nr = 0
+ for i in j:
+ if i[0] == "AMP-binding" or i[0] == "A-OX":
+ nr += 1
+ start = int(i[1])
+ end = int(i[2]) + 120
+ seq = seqdict[k][start:end]
+ name = k + "_A" + str(nr)
+ nrpsnames.append(name)
+ nrpsseqs.append(seq)
+ if len(nrpsnames) > 0:
+ writefasta(nrpsnames,nrpsseqs,"NRPSPredictor2/nrpsseqs.fasta")
+ #nrpspredcommand = "perl nrpsSpecPredictor.pl nrpsseqs.fasta ../" + nrpspredictoroutputfolder + " ." #OLD NRPSPREDICTOR1 command
+ os.chdir("NRPSPredictor2/")
+ #Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
+ if sys.platform == ('win32'):
+ nrpspred2codecommand = 'nrpscodepred nrpsseqs.fasta input.sig nrpscodes.txt > nul'
+ if sys.platform == ('linux2'):
+ nrpspred2codecommand = 'python nrpscodepred.py nrpsseqs.fasta input.sig nrpscodes.txt > /dev/null'
+ os.system(nrpspred2codecommand)
+ #Run NRPSPredictor2 SVM
+ currentdir = os.getcwd()
+ if sys.platform == ('win32'):
+ nrpspred2command = 'java -Ddatadir="' + currentdir + '\\data" -cp build/NRPSpredictor2.jar;lib/java-getopt-1.0.13.jar;lib/Utilities.jar;lib/libsvm.jar org.roettig.NRPSpredictor2.NRPSpredictor2 -i input.sig -r ..\\' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
+ if sys.platform == ('linux2'):
+ nrpspred2command = './NRPSpredictor2.sh -i input.sig -r ../' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1'
+ os.popen(nrpspred2command)
+ #Copy NRPSPredictor results
+ if sys.platform == ('win32'):
+ copycommand = 'copy/y nrpscodes.txt ..\\' + nrpspredictoroutputfolder.replace("/","\\") + ' > nul'
+ if sys.platform == ('linux2'):
+ copycommand = 'cp nrpscodes.txt ../' + nrpspredictoroutputfolder + " > /dev/null"
+ os.system(copycommand)
+ os.chdir("..")
+ elapsed = (time.time() - starttime)
+ #print "5206Time since start: " + str(elapsed)
+ # folgendes bis zum naechsten time braucht 500s, liegt wohl haupsaechlich an schlechtem minowa_A code
+ #Minowa method: extract AMP-binding domain, and run Minowa_A
+ if len(nrpsnames) > 0:
+ #print "Predicting NRPS A domain substrate specificities by Minowa et al. method\n"
+ logfile.write("Predicting NRPS A domain substrate specificities by Minowa et al. method")
+ nrpsnames2 = []
+ nrpsseqs2 = []
+ for k in pksnrpscoregenes:
+ j = domaindict[k]
+ nr = 0
+ for i in j:
+ if i[0] in ["AMP-binding", "A-OX"]:
+ nr += 1
+ start = int(i[1])
+ end = int(i[2])
+ seq = seqdict[k][start:end]
+ name = k + "_A" + str(nr)
+ nrpsnames2.append(name)
+ nrpsseqs2.append(seq)
+ writefasta(nrpsnames2,nrpsseqs2,minowanrpsoutputfolder + "nrpsseqs.fasta")
+ if sys.platform == ('win32'):
+ minowanrpscommand = "minowa_A ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
+ if sys.platform == ('linux2'):
+ minowanrpscommand = "python minowa_A.py ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt"
+ os.chdir("Minowa/")
+ os.system(minowanrpscommand)
+ os.chdir("..")
+
+ elapsed = (time.time() - starttime)
+ #print "5235Time since start: " + str(elapsed)
+ #Predict PKS AT domain specificities with Minowa et al. method and PKS code (NP searcher / ClustScan / own?)
+ for k in pksnrpscoregenes:
+ j = domaindict[k]
+ nr = 0
+ for i in j:
+ if i[0] == "PKS_AT":
+ nr += 1
+ start = int(i[1])
+ end = int(i[2])
+ seq = seqdict[k][start:end]
+ name = k + "_AT" + str(nr)
+ pksnames.append(name)
+ pksseqs.append(seq)
+ if len(pksnames) > 0:
+ writefasta(pksnames,pksseqs,pkssignatureoutputfolder + "pksseqs.fasta")
+ writefasta(pksnames,pksseqs,minowapksoutputfolder + "pksseqs.fasta")
+ #Run PKS signature analysis
+ elapsed = (time.time() - starttime)
+ #print "5254Time since start: " + str(elapsed)
+ print "Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences"
+ logfile.write("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences\n")
+ if sys.platform == ('win32'):
+ pkspredcommand = "PKS_analysis ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
+ if sys.platform == ('linux2'):
+ pkspredcommand = "python PKS_analysis.py ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt"
+ os.chdir("pkssignatures/")
+ os.system(pkspredcommand)
+ os.chdir("..")
+ #Minowa method: run Minowa_AT
+ elapsed = (time.time() - starttime)
+ #print "5266Time since start: " + str(elapsed)
+ print "Predicting PKS AT domain substrate specificities by Minowa et al. method"
+ logfile.write("Predicting PKS AT domain substrate specificities by Minowa et al. method\n")
+ if sys.platform == ('win32'):
+ minowapkscommand = "minowa_AT ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
+ if sys.platform == ('linux2'):
+ minowapkscommand = "python minowa_AT.py ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt"
+ os.chdir("Minowa/")
+ os.system(minowapkscommand)
+ os.chdir("..")
+
+ #Predict PKS CAL domain specificities with Minowa et al. method
+ elapsed = (time.time() - starttime)
+ #print "5279Time since start: " + str(elapsed)
+ print "Predicting CAL domain substrate specificities by Minowa et al. method"
+ logfile.write("Predicting CAL domain substrate specificities by Minowa et al. method\n")
+ for k in pksnrpscoregenes:
+ j = domaindict[k]
+ nr = 0
+ for i in j:
+ if i[0] == "CAL_domain":
+ nr += 1
+ start = int(i[1])
+ end = int(i[2])
+ seq = seqdict[k][start:end]
+ name = k + "_CAL" + str(nr)
+ calnames.append(name)
+ calseqs.append(seq)
+ if len(calnames) > 0:
+ writefasta(calnames,calseqs,minowacaloutputfolder + "calseqs.fasta")
+ if sys.platform == ('win32'):
+ minowacalcommand = "minowa_CAL ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
+ if sys.platform == ('linux2'):
+ minowacalcommand = "python minowa_CAL.py ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt"
+ os.chdir("Minowa/")
+ os.system(minowacalcommand)
+ os.chdir("..")
+
+ elapsed = (time.time() - starttime)
+ #print "5305Time since start: " + str(elapsed)
+ #Predict PKS KR domain stereochemistry using pattern as published in ClustScan
+ print "Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al."
+ logfile.write("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.\n")
+ for k in pksnrpscoregenes:
+ j = domaindict[k]
+ nr = 0
+ for i in j:
+ if i[0] == "PKS_KR":
+ nr += 1
+ start = int(i[1])
+ end = int(i[2])
+ seq = seqdict[k][start:end]
+ name = k + "_KR" + str(nr)
+ krnames.append(name)
+ krseqs.append(seq)
+ if len(krnames) > 0:
+ writefasta(krnames,krseqs,kranalysisoutputfolder + "krseqs.fasta")
+ if sys.platform == ('win32'):
+ kranalysiscommand = "kr_analysis ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
+ if sys.platform == ('linux2'):
+ kranalysiscommand = "python kr_analysis.py ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt"
+ os.chdir("kr_analysis/")
+ os.system(kranalysiscommand)
+ os.chdir("..")
+
+ #Read and parse all substrate specificity prediction output files
+ minowa_nrps_preds = {}
+ minowa_nrps_preds_details = {}
+ nrps_svm_preds = {}
+ nrps_svm_preds_details = {}
+ nrps_code_preds = {}
+ nrps_code_preds_details = {}
+ substratetransdict2 = {'pipecolate':'pip','fOHOrn':'orn','beta-Lys':'blys','5NhOrn':'orn','OHOrn':'orn','Aad':'Aaa','bOHTyr':'bht'}
+ if len(nrpsnames) > 0:
+ minowa_a_file = open(minowanrpsoutputfolder + "nrpspredoutput.txt","r")
+ minowa_a_file = minowa_a_file.read()
+ minowa_a_file = minowa_a_file.replace("\r","\n")
+ parts = minowa_a_file.split("\\\\\n")[1:]
+ for i in parts:
+ partlines = i.split("\n")
+ acc = partlines[0]
+ tophit = partlines[2].split("\t")[0]
+ if tophit in substratetransdict2.keys():
+ tophit = substratetransdict2[tophit]
+ minowa_nrps_preds[acc] = tophit.lower()
+ minowa_nrps_preds_details[acc] = "Minowa HMM method A-domain
Substrate specificity prediction top hits:
\n" + partlines[1] + "
\n" + partlines[2] + "
\n" + partlines[3] + "
\n" + partlines[4] + "
\n\n"
+ nrpspredictorfile1 = open(nrpspredictoroutputfolder + "nrpspredictor2.out","r")
+ nrpspredictorfile2 = open(nrpspredictoroutputfolder + "nrpscodes.txt","r")
+ nrpspredictorfile1 = nrpspredictorfile1.read()
+ nrpspredictorfile1 = nrpspredictorfile1.replace("\r","\n")
+ lines = nrpspredictorfile1.split("\n")[1:-1]
+ for k in lines:
+ tabs = k.split("\t")
+ nrps_svm_preds[tabs[0]] = tabs[6]
+ nrps_svm_preds_details[tabs[0]] = " NRPSPredictor2 SVM prediction details:
\n8 Angstrom 34 AA code:
\n" + tabs[1] + "
\nPredicted physicochemical class:
\n" + tabs[3] + "
\nLarge clusters prediction:
\n" + tabs[4] + "
\nSmall clusters prediction:
\n" + tabs[5] + "
\nSingle AA prediction:
\n" + tabs[6] + "
\n\n"
+ nrpspredictorfile2 = nrpspredictorfile2.read()
+ nrpspredictorfile2 = nrpspredictorfile2.replace("\r","\n")
+ lines = nrpspredictorfile2.split("\n")[:-1]
+ for k in lines:
+ tabs = k.split("\t")
+ nrps_code_preds[tabs[0]] = tabs[1]
+ nrps_code_preds_details[tabs[0]] = " NRPSPredictor2 Stachelhaus code prediction:
\n" + tabs[1] + "
\n\n"
+ minowa_pks_preds_details = {}
+ minowa_pks_preds = {}
+ pks_code_preds ={}
+ pks_code_preds_details ={}
+ substratetransdict = {'Malonyl-CoA':'mal','Methylmalonyl-CoA':'mmal','Methoxymalonyl-CoA':'mxmal','Ethylmalonyl-CoA':'emal','Isobutyryl-CoA':'isobut','2-Methylbutyryl-CoA':'2metbut','trans-1,2-CPDA':'trans-1,2-CPDA','Acetyl-CoA':'Acetyl-CoA','Benzoyl-_CoA':'benz','Propionyl-CoA':'prop','3-Methylbutyryl-CoA':'3metbut','Ethylmalonyl-CoA':'Ethyl_mal','CE-Malonyl-CoA':'cemal','2-Rhyd-Malonyl-CoA':'2Rhydmal','CHC-CoA':'CHC-CoA','inactive':'inactive'}
+ if len(pksnames) > 0:
+ minowa_at_file = open(minowapksoutputfolder + "pkspredoutput.txt","r")
+ minowa_at_file = minowa_at_file.read()
+ minowa_at_file = minowa_at_file.replace("\r","\n")
+ parts = minowa_at_file.split("\\\\\n")[1:]
+ for i in parts:
+ partlines = i.split("\n")
+ acc = partlines[0]
+ if substratetransdict.has_key(partlines[2].split("\t")[0]):
+ tophit = substratetransdict[partlines[2].split("\t")[0]]
+ else:
+ tophit = "pk"
+ minowa_pks_preds[acc] = tophit
+ minowa_pks_preds_details[acc] = "Minowa HMM method AT-domain
Substrate specificity prediction top hits:
\n" + partlines[1] + "
\n" + partlines[2] + "
\n" + partlines[3] + "
\n" + partlines[4] + "
\n\n"
+ pkssignaturefile = open(pkssignatureoutputfolder + "pkspredoutput.txt","r")
+ pkssignaturefile = pkssignaturefile.read()
+ pkssignaturefile = pkssignaturefile.replace("\r","\n")
+ parts = pkssignaturefile.split("//\n")[1:]
+ for i in parts:
+ partlines = i.split("\n")
+ partlines2 = []
+ for j in partlines:
+ if j != "":
+ partlines2.append(j)
+ partlines = partlines2
+ acc = partlines[0].split("\t")[0]
+ if len(partlines) > 2:
+ tophit = (partlines[1].split("\t")[0]).split("__")[1]
+ pks_code_preds[acc] = tophit
+ codes = []
+ prots = []
+ scores = []
+ for i in partlines[1:4]:
+ codes.append(i.split("\t")[0])
+ prot = i.split("\t")[1]
+ prot = prot.replace("_AT"," (AT")
+ prot = prot.replace("__","): ")
+ prots.append(prot)
+ scores.append(i.split("\t")[2])
+ if len(prots) >= 3:
+ pks_code_preds_details[acc] = "PKS Active Site Signature method
AT-domain substrate specificity prediction top hits:
\nCode:" + partlines[0].split("\t")[1] + "
\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)
\n" + codes[1] + " - " + prots[1] + " : (" + scores[1] + "% identity)
\n" + codes[2] + " - " + prots[2] + " : (" + scores[2] + "% identity)
\n\n"
+ elif len(prots) == 2:
+ pks_code_preds_details[acc] = "PKS Active Site Signature method
AT-domain substrate specificity prediction top hits:
\nCode:" + partlines[0].split("\t")[1] + "
\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)
\n" + codes[1] + " - " + prots[1] + " : (" + scores[1] + "% identity)
\n\n"
+ elif len(prots) == 1:
+ pks_code_preds_details[acc] = "PKS Active Site Signature method
AT-domain substrate specificity prediction top hits:
\nCode:" + partlines[0].split("\t")[1] + "
\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)
\n\n"
+ else:
+ pks_code_preds[acc] = "N/A"
+ pks_code_preds_details[acc] = "PKS Active Site Signature method
No AT-domain substrate specificity prediction hits above 40% identity.
\n\n"
+ minowa_cal_preds = {}
+ minowa_cal_preds_details = {}
+ if len(calnames) > 0:
+ minowa_cal_file = open(minowacaloutputfolder + "calpredoutput.txt","r")
+ minowa_cal_file = minowa_cal_file.read()
+ minowa_cal_file = minowa_cal_file.replace("\r","\n")
+ parts = minowa_cal_file.split("\\\\\n")[1:]
+ for i in parts:
+ partlines = i.split("\n")
+ acc = partlines[0]
+ tophit = partlines[2].split("\t")[0]
+ minowa_cal_preds[acc] = tophit
+ minowa_cal_preds_details[acc] = "Minowa HMM method
CAL-domain substrate specificity prediction top hits:
\n" + partlines[1] + "
\n" + partlines[2] + "
\n" + partlines[3] + "
\n" + partlines[4] + "
\n\n"
+ kr_activity_preds = {}
+ kr_stereo_preds = {}
+ if len(krnames) > 0:
+ krfile = open(kranalysisoutputfolder + "krpredoutput.txt","r")
+ krfile = krfile.read()
+ krfile = krfile.replace("\r","\n")
+ krlines = krfile.split("\n")[:-1]
+ for i in krlines:
+ tabs = i.split("\t")
+ kr_activity_preds[tabs[0]] = tabs[1]
+ kr_stereo_preds[tabs[0]] = tabs[2]
+
+ #Combine substrate specificity predictions into consensus prediction
+ consensuspreds = {}
+ #available_smiles_parts = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE','LEU','LYS','PHE','PRO','SER','THR','TRP','TYR','VAL','MET','ORN','ala','arg','asn','asp','cys','gln','glu','gly','his','ile','leu','lys','phe','pro','ser','thr','trp','tyr','val','met','orn','Ala','Arg','Asn','Asp','Cys','Gln','Glu','Gly','His','Ile','Leu','Lys','Phe','Pro','Ser','Thr','Trp','Tyr','Val','Met','Orn','MPRO','23DHB','34DHB','2HIVA','PGLY','DAB','BALA','AEO','4MHA','PICO','AAA','DHA','SCY','PIP','BMT','ADDS','mpro','23dhb','34dhb','2hiva','pgly','dab','bala','aeo','4mha','pico','aaa','dha','scy','pip','bmt','adds','Mpro','23Dhb','34Dhb','2Hiva','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','mal','mmal','omal','emal','nrp','pk']
+ available_smiles_parts = ['GLY','ALA','VAL','LEU','ILE','MET','PRO','PHE','TRP','SER','THR','ASN','GLN','TYR','CYS','LYS','ARG','HIS','ASP','GLU','MPRO','ORN','PGLY','DAB','BALA','AEO','DHA','PIP','BMT','gly','ala','val','leu','ile','met','pro','phe','trp','ser','thr','asn','gln','tyr','cys','lys','arg','his','asp','glu','aaa','mpro','dhb','2hiva','orn','pgly','dab','bala','aeo','4mha','pico','phg','dha','scy','pip','bmt','adds','aad','abu','hiv','dhpg','bht','3-me-glu','4pPro','ala-b','ala-d','dht','Sal','tcl','lys-b','hpg','hyv-d','iva','vol','mal','mmal','mxmal','emal','nrp','pk','Gly','Ala','Val','Leu','Ile','Met','Pro','Phe','Trp','Ser','Thr','Asn','Gln','Tyr','Cys','Lys','Arg','His','Asp','Glu','Mpro','23Dhb','34Dhb','2Hiva','Orn','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','DHpg','DHB','nrp','pk']
+ for i in pksnrpscoregenes:
+ nra = 0
+ nrat = 0
+ nrcal = 0
+ j = domaindict[i]
+ for k in j:
+ if k[0] == "PKS_AT":
+ nrat += 1
+ preds = []
+ preds.append(minowa_pks_preds[i + "_AT" + str(nrat)])
+ preds.append(pks_code_preds[i + "_AT" + str(nrat)])
+ cpred = "n"
+ for l in preds:
+ if preds.count(l) > 1:
+ if l in available_smiles_parts:
+ consensuspreds[i + "_AT" + str(nrat)] = l
+ else:
+ consensuspreds[i + "_AT" + str(nrat)] = "pk"
+ cpred = "y"
+ if cpred == "n":
+ consensuspreds[i + "_AT" + str(nrat)] = "pk"
+ if k[0] == "AMP-binding" or k[0] == "A-OX":
+ nra +=1
+ preds = []
+ preds.append(minowa_nrps_preds[i + "_A" + str(nra)])
+ preds.append(nrps_svm_preds[i + "_A" + str(nra)])
+ preds.append(nrps_code_preds[i + "_A" + str(nra)])
+ cpred = "n"
+ for l in preds:
+ if preds.count(l) > 1:
+ if l in available_smiles_parts:
+ consensuspreds[i + "_A" + str(nra)] = l
+ else:
+ consensuspreds[i + "_A" + str(nra)] = "nrp"
+ cpred = "y"
+ if cpred == "n":
+ consensuspreds[i + "_A" + str(nra)] = "nrp"
+ if k[0] == "CAL_domain":
+ nrcal += 1
+ if minowa_cal_preds[i + "_CAL" + str(nrcal)] in available_smiles_parts:
+ consensuspreds[i + "_CAL" + str(nrcal)] = minowa_cal_preds[i + "_CAL" + str(nrcal)]
+ else:
+ consensuspreds[i + "_CAL" + str(nrcal)] = "pk"
+
+ #Write all prediction details to HTML files for each gene to be used as pop-up window
+ domainnamesdict = {}
+ for i in pksnrpscoregenes:
+ j = domaindict[i]
+ domainnames = []
+ for k in j:
+ domainnames.append(k[0])
+ domainnamesdict[i] = domainnames
+ for i in pksnrpscoregenes:
+ if "PKS_AT" in domainnamesdict[i] or "AMP-binding" in domainnamesdict[i] or "A-OX" in domainnamesdict[i] or "CAL_domain" in domainnamesdict[i]:
+ j = domaindict[i]
+ nrat = 0
+ nra = 0
+ nrcal = 0
+ nrkr = 0
+ for k in j:
+ if k[0] == "PKS_AT":
+ nrat += 1
+ domainname = i + "_AT" + str(nrat)
+ htmloutfile = open(substrspecsfolder + domainname + ".html","w")
+ htmloutfile.write('\n\nPrediction details\n\n\n')
+ htmloutfile.write(minowa_pks_preds_details[domainname])
+ htmloutfile.write(pks_code_preds_details[domainname])
+ htmloutfile.write("Consensus Predictions: " + consensuspreds[domainname] + "")
+ htmloutfile.write('\n\n')
+ htmloutfile.close()
+ if k[0] == "AMP-binding" or k[0] == "A-OX":
+ nra += 1
+ domainname = i + "_A" + str(nra)
+ htmloutfile = open(substrspecsfolder + domainname + ".html","w")
+ htmloutfile.write('\n\nPrediction details\n\n\n')
+ htmloutfile.write(nrps_svm_preds_details[domainname])
+ htmloutfile.write(nrps_code_preds_details[domainname])
+ htmloutfile.write(minowa_nrps_preds_details[domainname])
+ htmloutfile.write("Consensus Prediction: '" + consensuspreds[domainname] + "'")
+ htmloutfile.write('\n\n')
+ htmloutfile.close()
+ if k[0] == "CAL_domain":
+ nrcal += 1
+ domainname = i + "_CAL" + str(nrcal)
+ htmloutfile = open(substrspecsfolder + domainname + ".html","w")
+ htmloutfile.write('\n\nPrediction details\n\n\n')
+ htmloutfile.write(minowa_cal_preds_details[domainname])
+ htmloutfile.write('\n\n')
+ htmloutfile.close()
+
+ elapsed = (time.time() - starttime)
+ #print "5541Time since start: " + str(elapsed)
+ #Predict biosynthetic gene order in gene cluster using starter domains, thioesterase domains, gene order and docking domains
+ compound_pred_dict = {}
+ dockingdomainanalysis = []
+ nrpspksclusters = []
+ a = 1
+ for i in geneclusters:
+ genecluster = i
+ clustercoregenes = clusterinfo[i][3]
+ clusterpksnrpsgenes = []
+ for j in clustercoregenes:
+ if j in pksnrpscoregenes:
+ clusterpksnrpsgenes.append(j)
+ if len(clusterpksnrpsgenes) > 0:
+ nrpspksclusters.append(genecluster)
+ pksgenes = 0
+ clusterpksgenes = []
+ nrpsgenes = 0
+ clusternrpsgenes = []
+ hybridgenes = 0
+ clusterhybridgenes = []
+ for j in clusterpksnrpsgenes:
+ k = nrpspkstypedict[j]
+ if "PKS" in k and "NRPS" not in k:
+ pksgenes += 1
+ clusterpksgenes.append(j)
+ elif "PKS" not in k and "NRPS" in k:
+ nrpsgenes += 1
+ clusternrpsgenes.append(j)
+ elif "PKS/NRPS" in k:
+ if ("PKS_KS" in domainnamesdict[j] or "PKS_AT" in domainnamesdict[j]) and ("AMP-binding" not in domainnamesdict[j] and "A-OX" not in domainnamesdict[j] and "Condensation" not in domainnamesdict[j]):
+ pksgenes += 1
+ clusterpksgenes.append(j)
+ elif ("PKS_KS" not in domainnamesdict[j] and "PKS_AT" not in domainnamesdict[j]) and ("AMP-binding" in domainnamesdict[j] or "A-OX" in domainnamesdict[j] or "Condensation" in domainnamesdict[j]):
+ nrpsgenes += 1
+ clusternrpsgenes.append(j)
+ elif "PKS" in k and "NRPS" in k:
+ hybridgenes += 1
+ clusterhybridgenes.append(j)
+ #If more than three PKS genes, use dock_dom_analysis if possible to identify order
+ dock_dom_analysis = "failed"
+ if pksgenes > 3 and nrpsgenes == 0 and hybridgenes == 0:
+ #print "Predicting PKS gene order by docking domain sequence analysis"
+ logfile.write("Predicting PKS gene order by docking domain sequence analysis")
+ dockhtmlfile = open(htmlfolder + "docking_analysis" + str(genecluster) + ".html","w")
+ #Find first and last genes based on starter module and TE / TD
+ startergene = ""
+ endinggene = ""
+ for k in clusterpksgenes:
+ if "Thioesterase" in domainnamesdict[k] or "TD" in domainnamesdict[k]:
+ if endinggene == "":
+ endinggene = k
+ else:
+ endinggene = ""
+ if len(domainnamesdict[k]) >=2 and "PKS_AT" == domainnamesdict[k][0] and "ACP" == domainnamesdict[k][1]:
+ if startergene == "":
+ startergene = k
+ else:
+ startergene = ""
+ if startergene == "":
+ for k in clusterpksgenes:
+ if len(domainnamesdict[k]) >=3 and "PKS_KS" == domainnamesdict[k][0] and "PKS_AT" == domainnamesdict[k][1] and "ACP" == domainnamesdict[k][2]:
+ if startergene == "":
+ startergene = k
+ else:
+ startergene = ""
+ break
+ #Extract N-terminal 50 residues of each non-starting protein, scan for docking domains using hmmsearch, parse output to locate interacting residues
+ ntermintresdict = {}
+ ntermnames = []
+ ntermseqs = []
+ for k in clusterpksgenes:
+ if k != startergene:
+ ntermnames.append(k)
+ seq = seqdict[k]
+ ntermseqs.append(seq[:50])
+ ntermfasta = "docking_analysis/input.fasta"
+ z = 0
+ for k in ntermnames:
+ writefasta([ntermnames[z]],[ntermseqs[z]],ntermfasta)
+ os.chdir("docking_analysis")
+ os.system("muscle -profile -quiet -in1 nterm.fasta -in2 input.fasta -out muscle.fasta")
+ intresidues = extractpositions("nterm.fasta","muscle.fasta",[2,15],"EryAIII_5_6_ref",ntermnames[z])
+ ntermintresdict[ntermnames[z]] = intresidues
+ os.chdir("..")
+ z += 1
+ #Extract C-terminal 100 residues of each non-ending protein, scan for docking domains using hmmsearch, parse output to locate interacting residues
+ ctermintresdict = {}
+ ctermnames = []
+ ctermseqs = []
+ for k in clusterpksgenes:
+ if k != endinggene:
+ ctermnames.append(k)
+ seq = seqdict[k]
+ ctermseqs.append(seq[-100:])
+ ctermfasta = "docking_analysis/input.fasta"
+ z = 0
+ for k in ctermnames:
+ writefasta([ctermnames[z]],[ctermseqs[z]],ctermfasta)
+ os.chdir("docking_analysis")
+ os.system("muscle -profile -quiet -in1 cterm.fasta -in2 input.fasta -out muscle.fasta")
+ intresidues = extractpositions("cterm.fasta","muscle.fasta",[55,64],"EryAII_ref",ctermnames[z])
+ ctermintresdict[ctermnames[z]] = intresidues
+ os.chdir("..")
+ z += 1
+ #If docking domains found in all, check for optimal order using interacting residues
+ genes_to_order = []
+ z = 0
+ for k in clusterpksgenes:
+ if k == startergene or k == endinggene:
+ pass
+ else:
+ genes_to_order.append(k)
+ z += 1
+ possible_orders = list(itertools.permutations(genes_to_order,len(genes_to_order)))
+ hydrophobic = ["A","V","I","L","F","W","Y","M"]
+ positivecharge = ["H","K","R"]
+ negativecharge = ["D","E"]
+ other = ["C","G","P","S","T","N","Q","X","U"]
+ possible_orders_scoredict = {}
+ for k in possible_orders:
+ score = 0
+ interactions = []
+ z = 0
+ for l in k[:-1]:
+ interactions.append([l,k[z + 1]])
+ z += 1
+ for l in interactions:
+ res1a = ctermintresdict[l[0]][0]
+ res1b = ntermintresdict[l[1]][0]
+ res2a = ctermintresdict[l[0]][1]
+ res2b = ntermintresdict[l[1]][1]
+ if (res1a in hydrophobic and res1b in hydrophobic) or (res1a in positivecharge and res1b in negativecharge) or (res1a in negativecharge and res1b in positivecharge):
+ score += 1
+ if (res1a in positivecharge and res1b in positivecharge) or (res1a in negativecharge and res1b in negativecharge):
+ score = score - 1
+ if (res2a in hydrophobic and res2b in hydrophobic) or (res2a in positivecharge and res2b in negativecharge) or (res2a in negativecharge and res2b in positivecharge):
+ score += 1
+ if (res2a in positivecharge and res2b in positivecharge) or (res2a in negativecharge and res2b in negativecharge):
+ score = score - 1
+ possible_orders_scoredict[k] = score
+ ranked_orders = sortdictkeysbyvaluesrev(possible_orders_scoredict)
+ ranked_orders_part = []
+ ranked_orders2 = []
+ a = 0
+ ranked_orders_len = len(ranked_orders) - 1
+ for i in ranked_orders:
+ if a == 0:
+ score = possible_orders_scoredict[i]
+ ranked_orders_part.append(i)
+ elif a == ranked_orders_len:
+ ranked_orders_part.append(i)
+ ranked_orders2 = ranked_orders2 + ranked_orders_part
+ else:
+ if possible_orders_scoredict[i] == score:
+ ranked_orders_part.append(i)
+ else:
+ ranked_orders_part.reverse()
+ ranked_orders2 = ranked_orders2 + ranked_orders_part
+ score = possible_orders_scoredict[i]
+ ranked_orders_part = []
+ ranked_orders_part.append(i)
+ a += 1
+ ranked_orders = ranked_orders2[:1000]
+ geneorders = ranked_orders
+ geneorders2 = []
+ for l in geneorders:
+ geneorder = []
+ if startergene != "":
+ geneorder.append(startergene)
+ [ geneorder.append(m) for m in l ]
+ #for m in l:
+ # geneorder.append(m)
+ if endinggene != "":
+ geneorder.append(endinggene)
+ geneorders2.append(geneorder)
+ geneorders = geneorders2
+ if len(ranked_orders) == 1000:
+ dockhtmlfile.write('\n\n\n\n\nDocking domain analysis. Score for 1000 highest scoring gene orders:
\n')
+ else:
+ dockhtmlfile.write('\n\n\n\n\nDocking domain analysis. Scores for all possible gene orders:
\n')
+ dockhtmlfile.write('Gene order | Score |
\n')
+ for l in geneorders:
+ string = ""
+ for m in l:
+ string = string + m + ","
+ if startergene != "" and endinggene != "":
+ string = string[:-1] + " | " + str(possible_orders_scoredict[tuple(l[1:-1])])
+ elif startergene == "" and endinggene != "":
+ string = string[:-1] + " | " + str(possible_orders_scoredict[tuple(l[:-1])])
+ elif startergene != "" and endinggene == "":
+ string = string[:-1] + " | " + str(possible_orders_scoredict[tuple(l[1:])])
+ elif startergene == "" and endinggene == "":
+ string = string[:-1] + " | " + str(possible_orders_scoredict[tuple(l)])
+ dockhtmlfile.write(string + " |
\n")
+ dockhtmlfile.write('\n
')
+ dockhtmlfile.close()
+ #print "Predicting PKS gene order by docking domain sequence analysis succeeded."
+ #Write html outfile with docking domain analysis output
+ #
+ logfile.write("Predicting PKS gene order by docking domain sequence analysis succeeded.")
+ dockingdomainanalysis.append(genecluster)
+ #If NRPS genes, mixed NRPS/PKS genes, PKS genes without detected docking domains, or clusters with a 1-3 PKS genes, assume colinearity
+ direction = 0
+ for k in clusterpksnrpsgenes:
+ if strandsdict[k] == "+":
+ direction += 1
+ elif strandsdict[k] == "-":
+ direction = direction - 1
+ if direction < 0:
+ clusterpksnrpsgenes.reverse()
+ if "Thioesterase" in domainnamesdict[clusterpksnrpsgenes[0]] or "TD" in domainnamesdict[clusterpksnrpsgenes[0]]:
+ clusterpksnrpsgenes.reverse()
+ geneorder = clusterpksnrpsgenes
+ #Generate substrates order from predicted gene order and consensus predictions
+ prediction = ""
+ for k in geneorder:
+ domains = domainnamesdict[k]
+ nra = 0
+ nrat = 0
+ nrcal = 0
+ for l in domains:
+ if "PKS_AT" in l:
+ nrat += 1
+ prediction = prediction + consensuspreds[k + "_AT" + str(nrat)] + " "
+ if "AMP-binding" in l or "A-OX" in l:
+ nra += 1
+ prediction = prediction + consensuspreds[k + "_A" + str(nra)] + " "
+ if "CAL_domain" in l:
+ nrcal += 1
+ prediction = prediction + consensuspreds[k + "_CAL" + str(nrcal)] + " "
+ prediction = prediction[:-1]
+ compound_pred_dict[genecluster] = prediction
+ a += 1
+
+ #Combine predictions into a prediction of the final chemical structure and generate images
+ os.chdir("NRPeditor")
+ failedstructures = []
+ for i in geneclusters:
+ genecluster = i
+ if compound_pred_dict.has_key(genecluster):
+ residues = compound_pred_dict[genecluster]
+ nrresidues = len(residues.split(" "))
+ if nrresidues > 1:
+ if sys.platform == ('win32'):
+ structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
+ elif sys.platform == ('linux2'):
+ structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
+ for i in residues.split(" "):
+ structcommand = structcommand + i + " "
+ structcommand = structcommand + 'TE"'
+ smilesinfo = os.popen(structcommand)
+ smilesinfo = smilesinfo.read()
+ smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
+ if sys.platform == ('linux2'):
+ smiles_string.replace("[X]","[*:X]")
+ smiles_string2 = ""
+ a = 1
+ for k in smiles_string:
+ if k == "X":
+ smiles_string2 = smiles_string2 + str(a)
+ a += 1
+ else:
+ smiles_string2 = smiles_string2 + k
+ smiles_string = smiles_string2
+ smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
+ smilesfile.write(smiles_string)
+ smilesfile.close()
+ depictstatus = depict_smile(genecluster,structuresfolder)
+ if depictstatus == "failed":
+ failedstructures.append(genecluster)
+ elif clusterinfo[genecluster][0] == "ectoine":
+ smiles_string = "CC1=NCCC(N1)C(=O)O"
+ smilesfile = open("genecluster" + str(genecluster) + ".smi","w")
+ smilesfile.write(smiles_string)
+ smilesfile.close()
+ depictstatus = depict_smile(genecluster,structuresfolder)
+ if depictstatus == "failed":
+ failedstructures.append(genecluster)
+ elif genecluster in failedstructures:
+ del failedstructures[failedstructures.index(genecluster)]
+ compound_pred_dict[genecluster] = "ectoine "
+ os.chdir("..")
+
+ elapsed = (time.time() - starttime)
+ #print "5826 Time since start: " + str(elapsed)
+ #ClusterBlast
+ if clusterblast == "y":
+ #Load gene cluster database into memory
+ #print "ClusterBlast: Loading gene clusters database into memory..."
+ logfile.write("ClusterBlast: Loading gene clusters database into memory...\n")
+
+ os.chdir(genomename + "/clusterblast")
+ #file = open( os.path.join(antismash_path, "clusterblast/geneclusters.txt") ,"r")
+ #filetext = file.read()
+ #lines = filetext.split("\n")
+ clusters = {}
+ #for i in open(os.path.join(antismash_path, "clusterblast/geneclusters.txt")):
+ bin_path = os.path.join(antismash_path, "clusterblast/geneclusters.bin")
+ if os.path.exists( bin_path ):
+ clusters = cPickle.load( open(bin_path) )
+ #print clusters
+ else:
+ for line in open( os.path.join(antismash_path, "clusterblast/geneclusters.txt") ,"r"):
+ line = line.strip()
+ tabs = line.split("\t")
+ accession = tabs[0]
+ clusterdescription = tabs[1]
+ clusternr = tabs[2]
+ clustertype = tabs[3]
+ clustername = accession + "_" + clusternr
+ clustertags = tabs[4].split(";")
+ clusterprots = tabs[5].split(";")
+ clusters[clustername] = [clusterprots,clusterdescription,clustertype,clustertags]
+ cPickle.dump(clusters, open(bin_path, 'w'), -1)
+ #Load gene cluster database proteins info into memory
+ #print "ClusterBlast: Loading gene cluster database proteins into memory..."
+ logfile.write("ClusterBlast: Loading gene cluster database proteins into memory...\n")
+ #file = open( os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta") ,"r")
+ #filetext = file.read()
+ #filetext = filetext.replace("\r","\n")
+ #lines = filetext.split("\n")
+ proteingeneclusters = {}
+ proteinlocations = {}
+ proteinstrands = {}
+ proteinannotations = {}
+ proteintags = {}
+ bin_path = os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta.bin")
+ if os.path.exists( bin_path ):
+ (proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags) = cPickle.load( open(bin_path, 'r') )
+ else:
+ for line in open( os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta") ,"r"):
+ line = line.replace('\n', '')
+ if line.startswith(">"):
+ tabs = line.split("|")
+ #print 'Protein:', tabs
+ protein = tabs[6]
+ locustag = tabs[4]
+ if accessiondict.has_key(locustag):
+ locustag = "h_" + locustag
+ proteintags[protein] = locustag
+ clustername = tabs[0] + "_" + tabs[1]
+ proteingeneclusters[protein] = clustername
+ location = tabs[2]
+ proteinlocations[protein] = location
+ strand = tabs[3]
+ proteinstrands[protein] = strand
+ annotation = tabs[5]
+ proteinannotations[protein] = annotation
+ cPickle.dump([proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags], open(bin_path, 'w'), -1)
+ #Run BLAST on gene cluster proteins of each cluster on itself to find internal homologs, store groups of homologs - including singles - in a dictionary as a list of lists accordingly
+ #print "Finding internal homologs in each gene cluster.."
+ logfile.write("Finding internal homologs in each gene cluster..\n")
+ internalhomologygroupsdict = {}
+ for i in geneclusters:
+ clusternumber = i
+ #Create input fasta files for BLAST search
+ queryclusterprotslist = clusterinfo[i][4]
+ queryclusterprots = []
+ for i in queryclusterprotslist:
+ queryclusterprots.append(i[4])
+ queryclusternames = []
+ queryclusterseqs = []
+ for i in queryclusterprots:
+ seq = seqdict[i]
+ name = fullnamedict[i]
+ queryclusterseqs.append(seq)
+ queryclusternames.append(name)
+ writefasta(queryclusternames,queryclusterseqs,"internal_input.fasta")
+ #Run and parse BLAST search
+ makeblastdbcommand = "makeblastdb -in internal_input.fasta -out internal_input.fasta -dbtype prot"
+ blastsearch = "blastp -db internal_input.fasta -query internal_input.fasta -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out internal_input.out"
+ if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y":
+ os.popen(makeblastdbcommand)
+ os.popen(blastsearch)
+ else:
+ os.system(makeblastdbcommand)
+ os.system(blastsearch)
+ #print "5920 makeblastdb finised"
+ blastoutput = open("internal_input.out","r").read()
+ minseqcoverage = 25
+ minpercidentity = 30
+ seqlengths = fastaseqlengths(proteins)
+ iblastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes)
+ iblastdict = iblastinfo[0]
+ iquerylist = iblastinfo[1]
+ #find and store internal homologs
+ groups = []
+ for j in queryclusternames:
+ jsplit = j.split("|")[4]
+ if iblastdict.has_key(j):
+ hits = iblastdict[j][0]
+ group = []
+ for k in hits:
+ if k[:2] == "h_":
+ group.append(k[2:])
+ elif k.count("|") > 4:
+ group.append(k.split("|")[4])
+ else:
+ group.append(k)
+ if jsplit not in group:
+ group.append( jsplit )
+ x = 0
+ for l in groups:
+ for m in group:
+ if m in l:
+ del groups[x]
+ [group.append(n) for n in l if n not in group]
+ #for n in l:
+ # if n not in group:
+ # group.append(n)
+ break
+ x += 1
+ group.sort()
+ groups.append(group)
+ else:
+ groups.append([ jsplit ])
+ internalhomologygroupsdict[clusternumber] = groups
+
+ #Run BLAST on gene cluster proteins of each cluster and parse output
+ #print "5961 Running NCBI BLAST+ gene cluster searches.."
+ logfile.write("Running NCBI BLAST+ gene cluster searches..\n")
+ for i in geneclusters:
+ clusternumber = i
+ #print " Gene cluster " + str(clusternumber)
+ #Create input fasta files for BLAST search
+ queryclusterprotslist = clusterinfo[i][4]
+ queryclusterprots = []
+ for i in queryclusterprotslist:
+ queryclusterprots.append(i[4])
+ queryclusternames = []
+ queryclusterseqs = []
+ for i in queryclusterprots:
+ seq = seqdict[i]
+ name = fullnamedict[i]
+ queryclusterseqs.append(seq)
+ queryclusternames.append(name)
+ equalpartsizes = int(len(queryclusternames)/nrcpus)
+ for i in range(nrcpus):
+ if i == 0:
+ setnames = queryclusternames[:equalpartsizes]
+ setseqs = queryclusterseqs[:equalpartsizes]
+ elif i == (nrcpus - 1):
+ setnames = queryclusternames[(i*equalpartsizes):]
+ setseqs = queryclusterseqs[(i*equalpartsizes):]
+ else:
+ setnames = queryclusternames[(i*equalpartsizes):((i+1)*equalpartsizes)]
+ setseqs = queryclusterseqs[(i*equalpartsizes):((i+1)*equalpartsizes)]
+ writefasta(setnames,setseqs,"input" + str(i) + ".fasta")
+ processes = []
+ processnames = []
+ for i in range(nrcpus):
+ processes.append(Process(target=runblast, args=["input" + str(i) + ".fasta"]))
+ [i.start() for i in processes]
+ time.sleep(10)
+ while True:
+ processrunning = "n"
+ for i in processes:
+ if i.is_alive():
+ processrunning = "y"
+ if processrunning == "y":
+ time.sleep(5)
+ else:
+ break
+ [i.join() for i in processes]
+ blastoutput = ""
+ for i in range(nrcpus):
+ output = open("input" + str(i) + ".out","r")
+ output = output.read()
+ blastoutput = blastoutput + output
+ os.chdir("..")
+ blastoutputfile = open("./clusterblastoutput.txt","w")
+ blastoutputfile.write(blastoutput)
+ blastoutputfile.close()
+ os.chdir("clusterblast")
+ #print " Blast search finished. Parsing results..."
+ logfile.write(" Blast search finished. Parsing results...\n")
+ minseqcoverage = 25
+ minpercidentity = 30
+ seqlengths = fastaseqlengths(proteins)
+ blastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes)
+ blastdict = blastinfo[0]
+ querylist = blastinfo[1]
+ #Remove queries without hits
+ querylist2 = []
+ for i in querylist:
+ if blastdict.has_key(i):
+ querylist2.append(i)
+ else:
+ pass
+ querylist = querylist2
+ hitclusters = blastinfo[2]
+ #Score BLAST output on all gene clusters
+ #Rank gene cluster hits based on 1) number of protein hits covering >25% sequence length or at least 100aa alignment, with >30% identity and 2) cumulative blast score
+ #Find number of protein hits and cumulative blast score for each gene cluster
+ #print " Scoring Blast outputs on database of gene clusters..."
+ logfile.write(" Scoring Blast outputs on database of gene clusters...\n")
+ hitclusterdict = {}
+ hitclusterdata = {}
+ for i in hitclusters:
+ hitclusterdatalist = []
+ nrhits = float(0)
+ nrcoregenehits = float(0)
+ cumblastscore = float(0)
+ hitpositions = []
+ hitposcorelist = []
+ for j in querylist:
+ querynrhits = 0
+ querycumblastscore = float(0)
+ nrhitsplus = "n"
+ for k in blastdict[j][0]:
+ if i == blastdict[j][1][k][0]:
+ if [querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])] not in hitpositions:
+ nrhitsplus = "y"
+ querynrhits += 1
+ blastscore = float(blastdict[j][1][k][6]) / 1000000
+ querycumblastscore = querycumblastscore + blastscore
+ hitclusterdatalist.append([j,k,blastdict[j][1][k][5],blastdict[j][1][k][6],blastdict[j][1][k][7],blastdict[j][1][k][8]])
+ hitclusterdata[i] = hitclusterdatalist
+ hitpositions.append([querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])])
+ if nrhitsplus == "y":
+ nrhits += 1
+ if j.split("|")[4] in allcoregenes:
+ nrcoregenehits += 0.1
+ for hit in range(querynrhits):
+ hitposcorelist.append(1)
+ else:
+ for hit in range(querynrhits):
+ hitposcorelist.append(0)
+ cumblastscore = cumblastscore + float(querycumblastscore)
+ query_givenscores_querydict = {}
+ query_givenscores_hitdict = {}
+ #Find groups of hits
+ hitgroupsdict = {}
+ for p in hitpositions:
+ if not hitgroupsdict.has_key(p[0]):
+ hitgroupsdict[p[0]] = [p[1]]
+ else:
+ hitgroupsdict[p[0]].append(p[1])
+ #Calculate synteny score; give score only if more than one hits (otherwise no synteny possible), and only once for every query gene and every hit gene
+ synteny_score = 0
+ z = 1
+ if nrhits > 1:
+ for p in hitpositions[:-1]:
+ tandem = "n"
+ #Check if a gene homologous to this gene has already been scored for synteny in the previous entry
+ if p[1] in hitgroupsdict[hitpositions[z][0]]:
+ tandem = "y"
+ #Score entry
+ if ((not query_givenscores_querydict.has_key(p[0])) or query_givenscores_querydict[p[0]] == 0) and ((not query_givenscores_hitdict.has_key(p[1])) or query_givenscores_hitdict[p[1]] == 0) and tandem == "n":
+ q = hitpositions[z]
+ if (abs(p[0] - q[0]) < 2) and abs(p[0]-q[0]) == abs(p[1]-q[1]):
+ synteny_score += 1
+ if hitposcorelist[z - 1] == 1 or hitposcorelist[z] == 1:
+ synteny_score += 1
+ query_givenscores_querydict[p[0]] = 1
+ query_givenscores_hitdict[p[1]] = 1
+ else:
+ query_givenscores_querydict[p[0]] = 0
+ query_givenscores_hitdict[p[1]] = 0
+ z += 1
+ #Give bonus to gene clusters with >0 core gene hits
+ if nrcoregenehits > 0:
+ corebonus = 3
+ else:
+ corebonus = 0
+ #sorting score is based on number of hits (discrete values) & cumulative blast score (behind comma values)
+ sortingscore = nrhits + synteny_score + corebonus + nrcoregenehits + cumblastscore
+ hitclusterdict[i] = sortingscore
+ #Sort gene clusters
+ rankedclusters = sortdictkeysbyvaluesrev(hitclusterdict)
+ rankedclustervalues = sortdictkeysbyvaluesrevv(hitclusterdict)
+ #Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters
+ #print " Writing output file..."
+ logfile.write(" Writing output file...\n")
+ #os.chdir("..")
+ #os.chdir(genomename)
+ #os.chdir("clusterblast")
+ out_file = open("cluster" + str(clusternumber) + ".txt","w")
+ out_file.write("ClusterBlast scores for " + infile)
+ out_file.write("\n\nTable of genes, locations, strands and annotations of query cluster:\n")
+ #out_file.write("\n")
+ #out_file.write("Table of genes, locations, strands and annotations of query cluster:")
+ #out_file.write("\n")
+ for i in queryclusterprots:
+ out_file.write("%s\t%s\t%s\t%s\t%s\t\n" % (i, proteins[3][i][0], proteins[3][i][1], proteins[3][i][2], proteins[3][i][3]))
+ """out_file.write(i)
+ out_file.write("\t")
+ out_file.write(proteins[3][i][0])
+ out_file.write("\t")
+ out_file.write(proteins[3][i][1])
+ out_file.write("\t")
+ out_file.write(proteins[3][i][2])
+ out_file.write("\t")
+ out_file.write(proteins[3][i][3])
+ out_file.write("\t")
+ out_file.write("\n")"""
+ out_file.write("\n\nSignificant hits: \n")
+ #out_file.write("\n")
+ #out_file.write("Significant hits: ")
+ #out_file.write("\n")
+ z = 0
+ for i in rankedclusters[:100]:
+ #out_file.write(str(z+1) + ". " + i + "\t" + clusters[i][1])
+ #out_file.write("\n")
+ out_file.write("%s. %s\t%s\n" % ((z+1), i, clusters[i][1]) )
+ z += 1
+ out_file.write("\n\n")
+ #out_file.write("\n")
+ z = 0
+ out_file.write("Details:")
+ for i in rankedclusters[:100]:
+ value = str(rankedclustervalues[z])
+ nrhits = value.split(".",1)[0]
+ if nrhits > 0:
+ cumblastscore = str(int(float(value.split(".")[1])))
+ out_file.write("\n\n>>\n\n%s. %s\nSource: %s\nType: %s\nNumber of proteins with BLAST hits to this cluster: %s\nCumulative BLAST score: %s\n\nTable of genes, locations, strands and annotations of subject cluster:\n" % (z+1, i, clusters[i][1], clusters[i][2], nrhits, cumblastscore))
+ clusterproteins = clusters[i][0]
+ #print 'clusterproteins\n\n', clusterproteins
+ """out_file.write("\n\n")
+ out_file.write(">>")
+ out_file.write("\n")
+ cumblastscore = str(int(float(value.split(".")[1])))
+ out_file.write("\n")
+ out_file.write(str(z+1) + ". " + i)
+ out_file.write("\n")
+ out_file.write("Source: " + clusters[i][1])
+ out_file.write("\n")
+ out_file.write("Type: " + clusters[i][2])
+ out_file.write("\n")
+ out_file.write("Number of proteins with BLAST hits to this cluster: " + nrhits)
+ out_file.write("\n")
+ out_file.write("Cumulative BLAST score: " + cumblastscore)
+ out_file.write("\n")
+ out_file.write("\n")
+ out_file.write("Table of genes, locations, strands and annotations of subject cluster:")
+ out_file.write("\n")
+ clusterproteins = clusters[i][0]"""
+
+ for j in clusterproteins:
+ #print '##########asdfasdf######', j, '---'+proteinlocations.keys()[0]+ '---', proteinannotations.has_key(j), proteinstrands.has_key(j), proteinlocations.has_key(j)
+ if proteinlocations.has_key(j) and proteinannotations.has_key(j) and proteinstrands.has_key(j):
+ if proteintags[j] == "no_locus_tag":
+ out_file.write(j)
+ else:
+ out_file.write(proteintags[j])
+ out_file.write( "\t%s\t%s\t%s\t%s\t%s\n" % (j, proteinlocations[j].split("-")[0], proteinlocations[j].split("-")[1], proteinstrands[j], proteinannotations[j]) )
+ """out_file.write("\t")
+ out_file.write(j)
+ out_file.write("\t")
+ out_file.write(proteinlocations[j].split("-")[0])
+ out_file.write("\t")
+ out_file.write(proteinlocations[j].split("-")[1])
+ out_file.write("\t")
+ out_file.write(proteinstrands[j])
+ out_file.write("\t")
+ out_file.write(proteinannotations[j])
+ out_file.write("\n")
+ """
+
+ out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n")
+ if i in hitclusterdata.keys():
+ tabledata = hitclusterdata[i]
+ for x in tabledata:
+ w = 0
+ for y in x:
+ if w == 0:
+ out_file.write( "%s\t" % y.split("|")[4] )
+ #out_file.write("\t")
+ w += 1
+ else:
+ out_file.write("%s\t" % y)
+ #out_file.write("\t")
+ out_file.write("\n")
+ else:
+ "data not found"
+ out_file.write("\n")
+ out_file.write("\n")
+ z += 1
+ #os.chdir("..")
+ #os.chdir("..")
+ #os.chdir("clusterblast")
+ os.chdir("..")
+ out_file.close()
+
+ elapsed = (time.time() - starttime)
+ #print "Time since start: " + str(elapsed)
+ #smCOG analysis
+ smcogtreedict = {}
+ if smcogs == "y":
+ #print "Performing smCOG analysis"
+ logfile.write("Performing smCOG analysis\n")
+ hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 1E-6 -o " + "./smcogs/smcogshmm_output.txt" + " --noali --tblout " + "./smcogs/smcogshmm.txt "+ hmms_path +"smcogs.hmm " + "./clusterblast/geneclusterprots.fasta"
+ #print hmmsearch
+ os.system(hmmsearch)
+ #print 'finised'
+ smcoghmmlengthsdict = hmmlengths(hmms_path+"smcogs.hmm")
+ smcogdict = hmmscanparse("./smcogs/smcogshmm_output.txt", smcoghmmlengthsdict)
+ smcogdict2 = {}
+ for i in smcogdict.keys():
+ newkey = i.split("|")[4]
+ smcogdict2[newkey] = smcogdict[i]
+ smcogdict = smcogdict2
+ #Write output
+ #os.chdir(genomename)
+ os.chdir("smcogs")
+ smcogfile = open("smcogs.txt","w")
+ for k in geneclustergenes:
+ if k not in pksnrpscoregenes:
+ l = smcogdict[k]
+ smcogfile.write(">> " + k + "\n")
+ smcogfile.write("name\tstart\tend\te-value\tscore\n")
+ smcogfile.write("** smCOG hits **\n")
+ for i in l:
+ smcogfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
+ smcogfile.write("\n\n")
+ smcogfile.close()
+ os.chdir("..")
+ os.chdir("..")
+ #smCOG phylogenetic tree construction
+ #print "Calculating and drawing phylogenetic trees of cluster genes with smCOG members"
+ logfile.write("Calculating and drawing phylogenetic trees of cluster genes with smCOG members")
+ os.chdir("smcogtree")
+ smcoganalysisgenes = []
+ #for k in geneclustergenes:
+ # if k not in pksnrpscoregenes:
+ # smcoganalysisgenes.append(k)
+ [smcoganalysisgenes.append(k) for k in geneclustergenes if k not in pksnrpscoregenes]
+ smcogsets = []
+ equalpartsizes = int(len(smcoganalysisgenes)/nrcpus)
+ for i in range(nrcpus):
+ if i == 0:
+ geneslist = smcoganalysisgenes[:equalpartsizes]
+ elif i == (nrcpus - 1):
+ geneslist = smcoganalysisgenes[(i*equalpartsizes):]
+ else:
+ geneslist = smcoganalysisgenes[(i*equalpartsizes):((i+1)*equalpartsizes)]
+ smcogsets.append(geneslist)
+ processes = []
+ processnames = []
+ z = 0
+ for k in smcogsets:
+ processes.append(Process(target=smcog_analysis, args=[k,z,accessiondict,seqdict,smcogdict,smcogsoutputfolder]))
+ z += 1
+ for k in processes:
+ k.start()
+ time.sleep(1)
+ while True:
+ processrunning = "n"
+ for k in processes:
+ if k.is_alive():
+ processrunning = "y"
+ if processrunning == "y":
+ time.sleep(5)
+ else:
+ break
+ for k in processes:
+ k.join()
+ os.chdir("..")
+ currentpath = os.getcwd()
+ os.chdir(smcogsoutputfolder)
+ dircontents = getdircontents()
+ for k in dircontents:
+ #POTENTIAL pERFORMANCE gainfor k in glob.glob('*.png'):
+ if ".png" in k:
+ tag = k.split(".png")[0]
+ smcogtreedict[tag] = tag + ".png"
+ os.chdir(currentpath)
+
+
+ ##Visualization
+ #Read in ClusterBlast data
+ #Read in PubMed / PubChem links of database gene clusters
+ if clusterblast == "y":
+ if genomename in os.getcwd():
+ os.chdir('..')
+ pubmed_dict = {}
+ pubchem_dict = {}
+ known_compound_dict = {}
+ #pubfile = open(antismash_path + "pubmed_pubchem_links.txt","r")
+ #pubfile = pubfile.read()
+ #publines = pubfile.split("\n")
+ #for i in publines:
+ bin_path = os.path.join(antismash_path, "pubmed_pubchem_links.bin")
+ if os.path.exists( bin_path ):
+ (pubmed_dict, pubchem_dict, known_compound_dict) = cPickle.load( open(bin_path) )
+ else:
+ for line in open(antismash_path + "pubmed_pubchem_links.txt","r"):
+ line = line.replace('\n', '')
+ tabs = line.split("\t")
+ acc = tabs[0]
+ if tabs[1] != "":
+ pubmed_dict[acc] = tabs[1]
+ if tabs[2] != "":
+ pubchem_dict[acc] = tabs[2]
+ if tabs[3] != "":
+ known_compound_dict[acc] = tabs[3]
+ cPickle.dump([pubmed_dict, pubchem_dict, known_compound_dict], open(bin_path, 'w'), -1)
+ #print "Writing visualization SVGs and XHTML"
+ logfile.write("Writing visualization SVGs and XHTML\n")
+ queryclusterdata = {}
+ nrhitgeneclusters = {}
+ cblastclusternr = 1
+ #print os.getcwd()
+ if clusterblast == "y":
+ for x in geneclusters:
+ clusterblastfile = open(clusterblastoutputfolder + "cluster" + str(x) + ".txt","r")
+ #print clusterblastfile
+ clusterblastfile = clusterblastfile.read()
+ clusterblastfile = clusterblastfile.replace("\r","\n")
+ toptenhitclusters = []
+ #Identify top ten hits for visualization
+ hitlines = ((clusterblastfile.split("Significant hits: \n")[1]).split("\nDetails:")[0]).split("\n")
+ #print '\n\n#######hitlines\n', hitlines
+ a = 0
+ cb_accessiondict = {}
+ b = 1
+ for i in hitlines:
+ if " " in i:
+ cb_accessiondict[b] = (i.split("\t")[0]).split(" ")[1]
+ if genomic_accnr == "" or genomic_accnr not in i:
+ b += 1
+ if a < 10:
+ if len(i) < 80:
+ toptenhitclusters.append(i)
+ elif len(i) >= 80:
+ j = i[0:77] + "..."
+ toptenhitclusters.append(j)
+ a += 1
+ #print clusterblastfile
+ details = (clusterblastfile.split("\nDetails:")[1]).split(">>")[1:]
+ #print details
+ nrhitclusters = len(toptenhitclusters)
+ #Save query gene cluster data
+ querylines = ((clusterblastfile.split("Table of genes, locations, strands and annotations of query cluster:\n")[1]).split("\n\n\nSignificant hits:")[0]).split("\n")
+ queryclustergenes = []
+ queryclustergenesdetails = {}
+ for i in querylines:
+ tabs = i.split("\t")
+ queryclustergenes.append(tabs[0])
+ queryclustergenesdetails[tabs[0]] = [tabs[1],tabs[2],tabs[3],tabs[4]]
+ #For every gene cluster, store hit genes and details
+ colorgroupsdict = {}
+ hitclusterdata = {}
+ hitclusternr = 1
+ compound_found = "n"
+ nrhitgeneclusters[x] = 0
+ for i in details:
+ hitclustergenes = []
+ hitclustergenesdetails = {}
+ #Only calculate for first ten hit gene clusters
+ if genomic_accnr == "" or genomic_accnr not in i:
+ if hitclusternr <= 10:
+ nrhitgeneclusters[x] = hitclusternr
+ accession = cb_accessiondict[hitclusternr]
+ hitclustergeneslines = ((i.split("Table of genes, locations, strands and annotations of subject cluster:\n")[1]).split("\n\nTable of Blast hits ")[0]).split("\n")
+ #print '***********\n', i, '\n'
+ #print hitclustergeneslines
+ for j in hitclustergeneslines:
+ tabs = j.split("\t")
+ hitclustergenes.append(tabs[0])
+ hitclustergenesdetails[tabs[0]] = [tabs[2],tabs[3],tabs[4],tabs[5],tabs[1]]
+
+ blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
+ querygeneswithhits = []
+ coregeneswithhits = []
+
+
+ blasthitdict = {}
+ blastdetailsdict = {}
+ querygenes = []
+ revblasthitdict = {}
+ hitgenes = []
+
+
+ for k in blasthitslines:
+ tabs = k.split("\t")
+ if tabs[0] not in querygeneswithhits:
+ querygeneswithhits.append(tabs[0])
+ if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
+ coregeneswithhits.append(tabs[0])
+
+
+ if blasthitdict.has_key(tabs[0]):
+ hits = blasthitdict[tabs[0]]
+ hits.append(tabs[1])
+ blasthitdict[tabs[0]] = hits
+ if revblasthitdict.has_key(tabs[1]):
+ revhits = revblasthitdict[tabs[1]]
+ revhits.append(tabs[0])
+ revblasthitdict[tabs[1]] = revhits
+ else:
+ revblasthitdict[tabs[1]] = [tabs[0]]
+ blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+ if tabs[0] not in querygenes:
+ querygenes.append(tabs[0])
+ hitgenes.append(tabs[1])
+ else:
+ blasthitdict[tabs[0]] = [tabs[1]]
+ if revblasthitdict.has_key(tabs[1]):
+ revhits = revblasthitdict[tabs[1]]
+ revhits.append(tabs[0])
+ revblasthitdict[tabs[1]] = revhits
+ else:
+ revblasthitdict[tabs[1]] = [tabs[0]]
+ blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+ if tabs[0] not in querygenes:
+ querygenes.append(tabs[0])
+ hitgenes.append(tabs[1])
+
+
+
+ for k in known_compound_dict.keys():
+ if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
+ ws0.write(x,4,known_compound_dict[k])
+ compound_found = "y"
+ """blasthitdict = {}
+ blastdetailsdict = {}
+ querygenes = []
+ revblasthitdict = {}
+ hitgenes = []
+ for i in blasthitslines:
+ tabs = i.split("\t")
+ if blasthitdict.has_key(tabs[0]):
+ hits = blasthitdict[tabs[0]]
+ hits.append(tabs[1])
+ blasthitdict[tabs[0]] = hits
+ if revblasthitdict.has_key(tabs[1]):
+ revhits = revblasthitdict[tabs[1]]
+ revhits.append(tabs[0])
+ revblasthitdict[tabs[1]] = revhits
+ else:
+ revblasthitdict[tabs[1]] = [tabs[0]]
+ blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+ if tabs[0] not in querygenes:
+ querygenes.append(tabs[0])
+ hitgenes.append(tabs[1])
+ else:
+ blasthitdict[tabs[0]] = [tabs[1]]
+ if revblasthitdict.has_key(tabs[1]):
+ revhits = revblasthitdict[tabs[1]]
+ revhits.append(tabs[0])
+ revblasthitdict[tabs[1]] = revhits
+ else:
+ revblasthitdict[tabs[1]] = [tabs[0]]
+ blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]]
+ if tabs[0] not in querygenes:
+ querygenes.append(tabs[0])
+ hitgenes.append(tabs[1])
+ """
+ #Make groups of genes for coloring
+ colorgroups = []
+ internalgroups = internalhomologygroupsdict[x]
+ for i in internalgroups:
+ querygenes_and_hits = []
+ for j in i:
+ #Make list of query gene and its hits
+ additionalhits = []
+ #For each hit, check if it was also hit by another gene; if so, only add it to the group if this hit had the lowest blast score
+ otherscores = []
+ queryscore = 0
+ if blasthitdict.has_key(j):
+ for k in blasthitdict[j]:
+ for l in blastdetailsdict.keys():
+ if k in l and j in l:
+ queryscore = blastdetailsdict[l][1]
+ elif k in l and j not in l:
+ otherscores.append(blastdetailsdict[l][1])
+ allscores = otherscores + [queryscore]
+ if queryscore == max(allscores):
+ additionalhits.append(k)
+ #Add additional hits to the querygenes_and_hits list that will form a colorgroup
+ querygenes_and_hits = querygenes_and_hits + additionalhits
+ if j not in querygenes_and_hits:
+ querygenes_and_hits.append(j)
+ if len(querygenes_and_hits) > 0:
+ colorgroups.append(querygenes_and_hits)
+ colorgroupsdict[hitclusternr] = colorgroups
+ hitclusterdata[hitclusternr] = [colorgroupsdict,hitclustergenes,hitclustergenesdetails,queryclustergenes,queryclustergenesdetails,toptenhitclusters,accession]
+ hitclusternr += 1
+ elif hitclusternr > 10 and hitclusternr <= 50:
+ blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n")
+ querygeneswithhits = []
+ coregeneswithhits = []
+ for k in blasthitslines:
+ tabs = k.split("\t")
+ if tabs[0] not in querygeneswithhits:
+ querygeneswithhits.append( tabs[0] )
+ if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits:
+ coregeneswithhits.append(tabs[0])
+ for k in known_compound_dict.keys():
+ if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0:
+ ws0.write(x,4,known_compound_dict[k])
+ compound_found = "y"
+ hitclusternr += 1
+ queryclusterdata[cblastclusternr] = [nrhitclusters,hitclusterdata]
+ cblastclusternr += 1
+ wb.save(genomename + "/" + genomename + ".geneclusters.xls")
+ #Gather and store data on each gene cluster
+ gtrcoglist = ['SMCOG1045','SMCOG1062','SMCOG1102']
+ transportercoglist = ['SMCOG1000','SMCOG1005','SMCOG1011','SMCOG1020','SMCOG1029','SMCOG1033','SMCOG1035','SMCOG1044','SMCOG1065','SMCOG1067','SMCOG1069','SMCOG1074','SMCOG1085','SMCOG1096','SMCOG1106','SMCOG1118','SMCOG1131','SMCOG1166','SMCOG1169','SMCOG1184','SMCOG1202','SMCOG1205','SMCOG1214','SMCOG1234','SMCOG1243','SMCOG1245','SMCOG1252','SMCOG1254','SMCOG1288']
+ qgeneclusterdata = {}
+ if smcogs == "y":
+ smcogdict2 = {}
+ smcogdescriptions = {}
+ for i in smcogdict.keys():
+ if len(smcogdict[i]) > 0 and len(smcogdict[i][0]) > 0 and ":" in smcogdict[i][0][0]:
+ smcogdict2[i] = (smcogdict[i][0][0]).split(":")[0]
+ smcogdescriptions[(smcogdict[i][0][0]).split(":")[0]] = (smcogdict[i][0][0]).split(":")[1]
+ elif len(smcogdict[i]) > 0:
+ smcogdict2[i] = smcogdict[i][0][0]
+ smcogdict = smcogdict2
+ for genecluster in geneclusters:
+ clustergenes = clusterinfo[genecluster][4]
+ clustergenes2 = []
+ #for i in clustergenes:
+ # clustergenes2.append(i[4])
+ [clustergenes2.append(i[4]) for i in clustergenes]
+ clustergenes = clustergenes2
+ clusternr = 1
+ clustertype = clusterinfo[genecluster][0]
+ annotations = {}
+ colors = []
+ starts = []
+ ends = []
+ strands = []
+ pksnrpsprots = []
+ gtrs = []
+ transporters = []
+ for j in clustergenes:
+ annotations[j] = proteins[3][j][3]
+ starts.append(int(proteins[3][j][0]))
+ ends.append(int(proteins[3][j][1]))
+ strands.append(proteins[3][j][2])
+ if j in allcoregenes:
+ colors.append("#810E15")
+ else:
+ colors.append("grey")
+ if j in pksnrpscoregenes:
+ pksnrpsprots.append(j)
+ if smcogs == "y":
+ if smcogdict.has_key(j) and len(smcogdict[j]) > 0 :
+ if smcogdict[j][0] in gtrcoglist:
+ gtrs.append(j)
+ if smcogdict[j][0] in transportercoglist:
+ transporters.append(j)
+ clustersize = max(ends) - min(starts)
+ if clusterblast == "n":
+ nrhitgeneclusters = {}
+ for i in geneclusters:
+ nrhitgeneclusters[i] = 0
+ hitgeneclusters = range(1,(nrhitgeneclusters[genecluster] + 1))
+ hitgeneclusterdata = {}
+ hitgeneclusterdata[genecluster] = [hitgeneclusters]
+ pksnrpsprotsnames = nrpspkstypedict
+ pksnrpsdomains = {}
+ domlist = []
+ domsdetails = {}
+ substrspecnrpspredictordict = {}
+ substrspecminowadict = {}
+ substrspecpkssigdict = {}
+ substrspecconsensusdict = {}
+ krpredictionsdict = {}
+ for i in pksnrpsprots:
+ domlist = []
+ domsdetails = {}
+ doms = domaindict[i]
+ for j in doms:
+ nr = 1
+ while j[0] + str(nr) in domlist:
+ nr += 1
+ domname = j[0] + str(nr)
+ domlist.append(domname)
+ domsdetails[domname] = [j[1],j[2]]
+ if "AMP-binding" in domname or "A-OX" in domname:
+ domname2 = i + "_" + "A" + str(nr)
+ substrspecminowadict[domname2] = minowa_nrps_preds[i + "_A" + str(nr)]
+ substrspecnrpspredictordict[domname2] = [nrps_code_preds[i + "_A" + str(nr)],nrps_svm_preds[i + "_A" + str(nr)]]
+ substrspecconsensusdict[domname2] = consensuspreds[i + "_A" + str(nr)]
+ if "PKS_AT" in domname:
+ domname2 = i + "_" + "AT" + str(nr)
+ substrspecminowadict[domname2] = minowa_pks_preds[i + "_AT" + str(nr)]
+ substrspecpkssigdict[domname2] = pks_code_preds[i + "_AT" + str(nr)]
+ substrspecconsensusdict[domname2] = consensuspreds[i + "_AT" + str(nr)]
+ if "CAL_domain" in domname:
+ domname2 = i + "_" + "CAL" + str(nr)
+ substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)]
+ substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)]
+ if "CAL_domain" in domname:
+ domname2 = i + "_" + "CAL" + str(nr)
+ substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)]
+ substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)]
+ if "PKS_KR" in domname:
+ domname2 = i + "_" + "KR" + str(nr)
+ krpredictionsdict[domname2] = [kr_activity_preds[i + "_KR" + str(nr)],kr_stereo_preds[i + "_KR" + str(nr)]]
+ pksnrpsdomains[i] = [domlist,domsdetails]
+ if compound_pred_dict.has_key(genecluster):
+ structpred = compound_pred_dict[genecluster]
+ else:
+ structpred = "N/A"
+ qgeneclusterdata[genecluster] = [clustertype,clustersize,clustergenes,annotations,starts,ends,strands,pksnrpsprots,pksnrpsprotsnames,pksnrpsdomains,substrspecnrpspredictordict,substrspecminowadict,substrspecpkssigdict,substrspecconsensusdict,gtrs,transporters,colors,hitgeneclusterdata,structpred,krpredictionsdict]
+
+ #Create genecluster svg for each gene cluster
+ geneposdict = {}
+ for qclusternr in geneclusters:
+ data = qgeneclusterdata[qclusternr]
+ #Some of the below 23 lines may already be internal to script, scan to remove unnecessary data fetching
+ clustertype = data[0]
+ clustersize = data[1]
+ genes = data[2]
+ annotations = data[3]
+ starts = data[4]
+ ends = data[5]
+ strands = data[6]
+ pksnrpsprots = data[7]
+ pksnrpsprotsnames = data[8]
+ pksnrpsdomains = data[9]
+ substrspecnrpspredictordict = data[10]
+ substrspecminowadict = data[11]
+ substrspecpkssigdict = data[12]
+ substrspecconsensusdict = data[13]
+ gtrs = data[14]
+ transporters = data[15]
+ colors = data[16]
+ hitgeneclusterdata = data[17]
+ structpred = data[18]
+ krpredictionsdict = data[19]
+ relpositions = relativepositions(starts,ends,clustersize)
+ rel_starts = relpositions[0]
+ rel_ends = relpositions[1]
+ y = 0
+ for i in genes:
+ geneposdict[i] = [starts[y],ends[y]]
+ y += 1
+ s = geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr)
+ outfile = open(svgfolder + "genecluster" + str(qclusternr) + ".svg","w")
+ outfile.write(s.getXML())
+ outfile.close()
+ #Create ClusterBlast svg
+ if clusterblast == "y":
+ clusterblastpositiondata = {}
+ #Create alignment svg for each pair of hit&query
+ for i in geneclusters:
+ hitclusters = range(queryclusterdata[i][0] + 1)[1:]
+ #Create svgs for pairwise gene cluster alignment
+ colorschemedict,rgbcolorscheme = calculate_colorgroups(i,hitclusters,queryclusterdata,internalhomologygroupsdict)
+ for k in hitclusters:
+ cresults = clusterblastresults(i,[k],queryclusterdata,colorschemedict,rgbcolorscheme)
+ s = cresults[0]
+ clusterblastpositiondata[str(i) + "_"+str(k)] = cresults[1]
+ outfile = open(svgfolder + "clusterblast" + str(i) + "_" + str(k) + ".svg","w")
+ outfile.write(s.getXML())
+ outfile.close()
+ #Create svgs for multiple gene cluster alignment
+ cresults = clusterblastresults(i,hitclusters,queryclusterdata,colorschemedict,rgbcolorscheme)
+ s = cresults[0]
+ clusterblastpositiondata[str(i) + "_all"] = cresults[1]
+ outfile = open(svgfolder + "clusterblast" + str(i) + "_all.svg","w")
+ outfile.write(s.getXML())
+ outfile.close()
+
+ #Create folder for SEARCHGTR HTML files, load search form template
+ formtemplate = open("search_form.html","r")
+ formtemplate = formtemplate.read()
+ formtemplate = formtemplate.replace("\r","\n")
+ formtemplateparts = formtemplate.split("FASTASEQUENCE")
+ #Create HTML file with gene cluster info in hidden div tags
+ htmlfile = open("empty.xhtml","r")
+ html = htmlfile.read()
+ html = html.replace("\r","\n")
+ htmlparts = html.split("")
+ htmloutfile = open(genomename + "/display.xhtml","w")
+ htmloutfile.write(htmlparts[0])
+ #Add lines toreload all svgs up front
+ for qclusternr in geneclusters:
+ htmloutfile.write(' loadsvg(' + str(qclusternr) + ');\n')
+ if clusterblast == "y":
+ cblastclusters = [1,2,3,4,5,6,7,8,9,10]
+ for qclusternr in geneclusters:
+ nrhitclusters = queryclusterdata[qclusternr][0]
+ for j in range(nrhitclusters):
+ htmloutfile.write(' loadcblastsvg(' + str(qclusternr) + ',' + str(j+1) + ');\n')
+ #For each gene cluster, add hidden div tags for gene names, add hidden div tags for NRPS/PKS domains, add hidden div tags for ClusterBLAST depictions
+ htmloutfile.write(htmlparts[1])
+ for qclusternr in geneclusters:
+ data = qgeneclusterdata[qclusternr]
+ pksnrpsprots = data[7]
+ pksnrpsprotsnames = data[8]
+ pksnrpsdomains = data[9]
+ a = 0
+ for i in pksnrpsprots:
+ for j in pksnrpsdomains[i][0]:
+ htmloutfile.write(' $("#b' + str(qclusternr) + '_00' + str(a) + '_div").hide();\n')
+ a += 1
+ htmloutfile.write(htmlparts[2])
+ #Add top menu
+ gifdict = {"t1pks":"16","t2pks":"17","t3pks":"18","t4pks":"20","nrps":"10","amglyccycl":"1","bcin":"2","blactam":"3","butyrolactone":"4","ectoine":"5","terpene":"19","indole":"7","lant":"8","melanin":"9","nucleoside":"12","other":"13","phosphoglycolipid":"14","siderophore":"15"}
+ htmloutfile.write('\n')
+ menubutton_nr = 1
+ nrclustercolumns = 1
+ for i in geneclusters:
+ if qgeneclusterdata[i][0] in gifdict.keys():
+ typenr = gifdict[qgeneclusterdata[i][0]]
+ elif "-" in qgeneclusterdata[i][0]:
+ typenr = "6"
+ else:
+ typenr = "13"
+ htmloutfile.write('\n')
+ if menubutton_nr == 22 or menubutton_nr == 49:
+ htmloutfile.write('
')
+ nrclustercolumns += 1
+ menubutton_nr += 1
+
+ #Add gene cluster description
+ htmloutfile.write(htmlparts[3])
+ extrapixelsdict = {}
+ for qclusternr in geneclusters:
+ data = qgeneclusterdata[qclusternr]
+ clustertype = data[0]
+ clustersize = data[1]
+ genes = data[2]
+ annotations = data[3]
+ starts = data[4]
+ ends = data[5]
+ strands = data[6]
+ pksnrpsprots = data[7]
+ pksnrpsprotsnames = data[8]
+ pksnrpsdomains = data[9]
+ substrspecnrpspredictordict = data[10]
+ substrspecminowadict = data[11]
+ substrspecpkssigdict = data[12]
+ substrspecconsensusdict = data[13]
+ gtrs = data[14]
+ transporters = data[15]
+ colors = data[16]
+ hitgeneclusterdata = data[17]
+ structpred = data[18]
+ krpredictionsdict = data[19]
+ relpositions = relativepositions(starts,ends,clustersize)
+ rel_starts = relpositions[0]
+ rel_ends = relpositions[1]
+ #Create genes overview pop-up HTMLs
+ genepopupoutfile = open(htmlfolder + "geneclustergenes" + str(qclusternr) + '.html',"w")
+ genepopupoutfile.write('\n\n\n\n\nOverview of gene cluster genes:
\n')
+ genepopupoutfile.write('Gene | Annotation | Start position | End position | Strand |
\n')
+ for i in genes:
+ genepopupoutfile.write('' + i + ' | ' + annotations[i].replace("_"," ") + ' | ' + str(starts[genes.index(i)]) + ' | ' + str(ends[genes.index(i)]) + ' | ' + strands[genes.index(i)] + ' |
\n')
+ genepopupoutfile.write('\n
Biosynthetic gene cluster signature gene domains detected:
\n')
+ genepopupoutfile.write('Gene | Detected domains | Bit scores | \n')
+ for i in genes:
+ if i in allcoregenes:
+ detected_doms = detecteddomainsdict[i]
+ for j in detected_doms:
+ genepopupoutfile.write('
' + i + ' | ' + str(j[0]) + ' | ' + str(j[1]) + ' | \n')
+ genepopupoutfile.write('\n
')
+ genepopupoutfile.write('\n\n\n')
+ genepopupoutfile.close()
+ #Add gene cluster description on top
+ if qclusternr == 1:
+ htmloutfile.write('')
+ else:
+ htmloutfile.write('\n\n
')
+ #Add menu bars 1 & 2
+ htmloutfile.write('
Gene cluster description
')
+ htmloutfile.write('
PKS/NRPS domain annotation
')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ if screenwidth < 1280:
+ htmloutfile.write('
\n')
+ else:
+ htmloutfile.write('
\n')
+ htmloutfile.write("
Gene Cluster " + str(qclusternr) + ". Type = " + clustertype + ". Location: "+ str(starts[0]) + " - " + str(ends[-1]) + " nt. Click on genes for more information.")
+ if len(genomic_accnr) > 4:
+ htmloutfile.write('
GBK')
+ #Genes overview pop-up.
+ if len(clustertype) > 20:
+ htmloutfile.write('
')
+ htmloutfile.write('
Genes and detection info overview')
+ htmloutfile.write("
\n\n")
+ htmloutfile.write('
\n')
+ if nrclustercolumns > 1:
+ spacers = nrclustercolumns - 1
+ for i in range(spacers):
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ #Add gene pop-ups
+ a = 0
+ for i in genes:
+ htmloutfile.write('\n\n")
+ htmloutfile.write('
\n')
+ htmloutfile.write(i)
+ htmloutfile.write("
\n\n")
+ a += 1
+ #Early calculation of nr of domains to be able to fit structure prediction information of large NRPSs/PKSs
+ pksnrpsdomainnr = 0
+ krdomainnr = 0
+ adomainnr = 0
+ for i in pksnrpsprots:
+ doms = pksnrpsdomains[i][0]
+ first = "no"
+ nra = 0
+ nrat = 0
+ nrkr = 0
+ nrcal = 0
+ for j in doms:
+ if "AMP-binding" in j or "A-OX" in j:
+ j = "A"
+ nra += 1
+ adomainnr += 1
+ z = nra
+ if "KR" in j:
+ j = "KR"
+ nrkr += 1
+ krdomainnr += 1
+ z = nrkr
+ if "AT" in j and "docking" not in j:
+ j = "AT"
+ nrat += 1
+ pksnrpsdomainnr += 1
+ z = nrat
+ if "CAL" in j:
+ j = "CAL"
+ nrcal += 1
+ pksnrpsdomainnr += 1
+ z = nrcal
+ pixels = adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16) + 375
+ extrapixels = pixels - (676 + len(pksnrpsprots) * 99)
+ if extrapixels < 0:
+ extrapixels = 0
+ extrapixelsdict[qclusternr] = extrapixels
+ #Add picture of predicted chemical structure
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ if screenwidth < 1280:
+ htmloutfile.write('
Predicted core structure
\n')
+ else:
+ htmloutfile.write('
Predicted core structure
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ if qclusternr in failedstructures:
+ htmloutfile.write('
\n')
+ elif " " in structpred:
+ htmloutfile.write('
\n')
+ else:
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write("Monomers prediction: " + structpred + "
\n")
+ if qclusternr in dockingdomainanalysis:
+ htmloutfile.write('
Docking domain analysis results.\n')
+ nrpsfound = "no"
+ pksnrpsdomainnr = 0
+ adomainnr = 0
+ krdomainnr = 0
+ for i in pksnrpsprots:
+ doms = pksnrpsdomains[i][0]
+ first = "no"
+ nra = 0
+ nrat = 0
+ nrkr = 0
+ nrcal = 0
+ for j in doms:
+ if "AMP-binding" in j or "A-OX" in j:
+ j = "A"
+ nra += 1
+ adomainnr += 1
+ z = nra
+ if "KR" in j:
+ j = "KR"
+ nrkr += 1
+ krdomainnr += 1
+ z = nrkr
+ if "AT" in j and "docking" not in j:
+ j = "AT"
+ nrat += 1
+ pksnrpsdomainnr += 1
+ z = nrat
+ if "CAL" in j:
+ j = "CAL"
+ nrcal += 1
+ pksnrpsdomainnr += 1
+ z = nrcal
+ prediction = "no"
+ domname = str(i) + "_" + str(j) + str(z)
+ if domname in substrspecnrpspredictordict.keys():
+ nrpsfound = "yes"
+ prediction = "yes"
+ if substrspecnrpspredictordict[domname][0] == "nrp":
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
NRPSPredictor code prediction, '+ str(j) + str(z) + ': ?\n')
+ else:
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
NRPSPredictor code prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][0] + '\n')
+ if substrspecnrpspredictordict[domname][1] == "nrp":
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ?\n')
+ else:
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][1] + '\n')
+ if domname in substrspecminowadict.keys():
+ prediction = "yes"
+ if substrspecminowadict[domname] == "nrp" or substrspecminowadict[domname] == "pk":
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
Minowa prediction, '+ str(j) + str(z) + ': ?\n')
+ else:
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
Minowa prediction, '+ str(j) + str(z) + ': ' + substrspecminowadict[domname] + '\n')
+ if domname in substrspecpkssigdict.keys():
+ prediction = "yes"
+ if substrspecpkssigdict[domname] == "pk":
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
PKS code prediction, '+ str(j) + str(z) + ': ?\n')
+ else:
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
PKS code prediction, '+ str(j) + str(z) + ': ' + substrspecpkssigdict[domname] + '\n')
+ if domname in krpredictionsdict.keys():
+ if first == "no":
+ first = "yes"
+ htmloutfile.write(i + ':
')
+ htmloutfile.write('
KR activity, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][0] + "\n")
+ htmloutfile.write('
KR stereochemistry, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][1] + "\n")
+ #Add link to prediction details pop-up
+ if prediction == "yes":
+ htmloutfile.write('
Prediction details\n')
+ if nrpsfound == "yes":
+ htmloutfile.write('
Perform Norine peptide search')
+ htmloutfile.write('
')
+ if screenwidth < 1280:
+ htmloutfile.write('
File outputs
\n')
+ else:
+ htmloutfile.write('
Downloadable output files
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
\n')
+ htmloutfile.write('
')
+ if fullhmm == "y" or fullblast == "y":
+ htmloutfile.write('
Open EMBL summary file')
+ #htmloutfile.write('
Download CSV summary file')
+ if fullhmm == "y":
+ htmloutfile.write('
Sec. met. enriched genome regions')
+ htmloutfile.write('
Open XLS overview table')
+ htmloutfile.write('
')
+ htmloutfile.write("
\n\n")
+ #Add descriptions of NRPS/PKS genes
+ htmloutfile.write('
\n')
+ htmloutfile.write("
\n\n")
+ z = 1
+ for i in pksnrpsprots:
+ htmloutfile.write('
\n')
+ htmloutfile.write(i + " (" + pksnrpsprotsnames[i].lower() + ")")
+ htmloutfile.write("
\n\n")
+ z += 1
+ #Add NRPS/PKS domain pop-ups
+ longestprot = 0
+ protlengthdict = {}
+ for i in pksnrpsprots:
+ protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3
+ protlengthdict[i] = protlength
+ if protlength > longestprot:
+ longestprot = protlength
+ try:
+ aa2pixelratio = longestprot * 0.75 / screenwidth
+ except:
+ aa2pixelratio = 0.1
+ a = 0
+ z = 1
+ for i in pksnrpsprots:
+ domainsdict = pksnrpsdomains[i][1]
+ nra = 0
+ nrat = 0
+ nrkr = 0
+ nrcal = 0
+ for j in pksnrpsdomains[i][0]:
+ startpos = domainsdict[j][0]
+ endpos = domainsdict[j][1]
+ htmloutfile.write('\n\n")
+ a += 1
+ z += 1
+ htmloutfile.write('
\n')
+
+ if clusterblast == "y":
+ #Write ClusterBlast divs with pictures and description pop-up tags
+ htmloutfile.write('
\n\n')
+ #Add menu bar 3
+ htmloutfile.write('
Homologous gene clusters
')
+ htmloutfile.write('
')
+ htmloutfile.write('
')
+ for qclusternr in geneclusters:
+ nrhitclusters = queryclusterdata[qclusternr][0]
+ hitclusterdata = queryclusterdata[qclusternr][1]
+ if qclusternr == 1:
+ htmloutfile.write('